From 36d50de50e30e92950070c3449b99d78143fb221 Mon Sep 17 00:00:00 2001 From: felix Date: Fri, 28 Mar 2025 00:04:31 +0800 Subject: [PATCH 001/443] ckmoe: change cmake; use smaller shape for i4 (#2027) * change cmake; use smaller shape for i4 * fix pki4 run * fix typo * fix runtime arch logic for moe_gemm2 example --------- Co-authored-by: coderfeli Co-authored-by: illsilin --- example/65_gemm_multiply_multiply/CMakeLists.txt | 4 ++-- .../65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp | 10 +++++----- .../65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt index 38b42fefc4..95fd8bace8 100644 --- a/example/65_gemm_multiply_multiply/CMakeLists.txt +++ b/example/65_gemm_multiply_multiply/CMakeLists.txt @@ -3,14 +3,14 @@ add_example_executable(example_gemm_multiply_multiply_xdl_fp8_ab_scale gemm_mult add_example_executable(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp) add_example_executable(example_gemm_add_add_xdl_fp16 gemm_add_add_xdl_fp16.cpp) add_example_executable(example_gemm_multiply_multiply_xdl_int8 gemm_multiply_multiply_xdl_int8.cpp) -# add_example_executable(example_moe_gemm1_xdl_fp8 moe_gemm1_xdl_fp8.cpp) +add_example_executable(example_moe_gemm1_xdl_fp8 moe_gemm1_xdl_fp8.cpp) add_example_executable(example_moe_gemm2_xdl_fp8 moe_gemm2_xdl_fp8.cpp) list(APPEND gpu_list gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) - # add_example_executable(example_moe_gemm1_xdl_pk_i4 moe_gemm1_xdl_pk_i4.cpp) + add_example_executable(example_moe_gemm1_xdl_pk_i4 moe_gemm1_xdl_pk_i4.cpp) add_example_executable(example_moe_gemm2_xdl_pk_i4 moe_gemm2_xdl_pk_i4.cpp) set(target 1) endif() diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp 
b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp index 17f4cd8a3f..1102ce1054 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp @@ -191,14 +191,14 @@ int main(int argc, char* argv[]) // experts = 8 // per expert: // GEMM shape - ck::index_t N = 14336 * 2; - ck::index_t K = 4096; + ck::index_t N = 4096 * 2; + ck::index_t K = 6144; ck::index_t experts = 8; ck::index_t sorted_tile_num = 16; ck::index_t valid_tile_num = 13; ck::index_t sorted_size = sorted_tile_num * MPerBlock; ck::index_t valid_size = valid_tile_num * MPerBlock; - ck::index_t tokens = 64; + ck::index_t tokens = 644; ck::index_t topk = 2; if(argc == 1) @@ -440,8 +440,8 @@ int main(int argc, char* argv[]) b_element_op, cde_element_op); - if(!device_op.IsSupportedArgument(argument) || ck::get_device_name() != "gfx942" || - ck::get_device_name() != "gfx950") + if(!device_op.IsSupportedArgument(argument) || + !(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950")) { throw std::runtime_error( "wrong! device_gemm with the specified compilation parameters does " diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp index 8441862004..528503a2c4 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp @@ -407,8 +407,8 @@ int main(int argc, char* argv[]) b_element_op, cde_element_op); - if(!device_op.IsSupportedArgument(argument) || ck::get_device_name() != "gfx942" || - ck::get_device_name() != "gfx950") + if(!device_op.IsSupportedArgument(argument) || + !(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950")) { throw std::runtime_error( "wrong! 
device_gemm with the specified compilation parameters does " From a426f673018465e057f19355c444ff1c0eb2ff35 Mon Sep 17 00:00:00 2001 From: spolifroni-amd Date: Thu, 27 Mar 2025 17:13:18 -0400 Subject: [PATCH 002/443] creation of install doc and refactor of doc in general (#1908) * creation of install doc and refactor of doc in general * updates based on review comments * updated based on review comments * updated readme and contributors markdown * added extra note to not use -j on its own * added note about smoke tests and regression tests * made changes as per Illia's feedback --------- Co-authored-by: Aviral Goel --- CONTRIBUTORS.md | 5 +- README.md | 6 +- .../Composable-Kernel-math.rst} | 11 +- .../Composable-Kernel-structure.rst | 29 +++ docs/conceptual/what-is-ck.rst | 41 ----- docs/index.rst | 23 +-- docs/install/Composable-Kernel-Docker.rst | 16 ++ docs/install/Composable-Kernel-install.rst | 72 ++++++++ .../Composable-Kernel-prerequisites.rst | 32 ++++ docs/install/dockerhub.rst | 101 ----------- ...st => Composable-Kernel-API-reference.rst} | 16 +- ...pper.rst => Composable-Kernel-wrapper.rst} | 13 +- docs/sphinx/_toc.yml.in | 40 +++-- docs/tutorial/Composable-Kernel-examples.rst | 40 +++++ docs/tutorial/tutorial_hello_world.rst | 165 ------------------ 15 files changed, 244 insertions(+), 366 deletions(-) rename docs/{reference/Supported_Primitives_Guide.rst => conceptual/Composable-Kernel-math.rst} (85%) create mode 100644 docs/conceptual/Composable-Kernel-structure.rst delete mode 100644 docs/conceptual/what-is-ck.rst create mode 100644 docs/install/Composable-Kernel-Docker.rst create mode 100644 docs/install/Composable-Kernel-install.rst create mode 100644 docs/install/Composable-Kernel-prerequisites.rst delete mode 100644 docs/install/dockerhub.rst rename docs/reference/{API_Reference_Guide.rst => Composable-Kernel-API-reference.rst} (79%) rename docs/reference/{wrapper.rst => Composable-Kernel-wrapper.rst} (88%) create mode 100644 
docs/tutorial/Composable-Kernel-examples.rst delete mode 100644 docs/tutorial/tutorial_hello_world.rst diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 8ef5c2b726..0900b7a1f8 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -20,10 +20,11 @@ Tejash Shah, 2019-2020 Xiaoyan Zhou, 2020 [Jianfeng Yan](https://github.com/j4yan), 2021-2022 - +[Jun Liu](https://github.com/junliume), 2021-2024 ## Product Manager -[Jun Liu](https://github.com/junliume) +[John Afaganis](https://github.com/afagaj) + ## Contributors diff --git a/README.md b/README.md index c316a0a322..29d3d4e85a 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,7 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa ```bash make -j install ``` + **[See Note on -j](#notes)** ## Optional post-install steps @@ -146,7 +147,8 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html ``` -Note the `-j` option for building with multiple threads in parallel, which speeds up the build significantly. +### Notes +The `-j` option builds with multiple threads in parallel, which speeds up the build significantly. However, `-j` launches unlimited number of threads, which can cause the build to run out of memory and crash. On average, you should expect each thread to use ~2Gb of RAM. Depending on the number of CPU cores and the amount of RAM on your system, you may want to @@ -211,4 +213,4 @@ script/uninstall_precommit.sh ``` If you need to temporarily disable pre-commit hooks, you can add the `--no-verify` option to the -`git commit` command. +`git commit` command. 
\ No newline at end of file diff --git a/docs/reference/Supported_Primitives_Guide.rst b/docs/conceptual/Composable-Kernel-math.rst similarity index 85% rename from docs/reference/Supported_Primitives_Guide.rst rename to docs/conceptual/Composable-Kernel-math.rst index e24acf5656..1c21fd8a11 100644 --- a/docs/reference/Supported_Primitives_Guide.rst +++ b/docs/conceptual/Composable-Kernel-math.rst @@ -1,18 +1,15 @@ .. meta:: - :description: Composable Kernel documentation and API reference library - :keywords: composable kernel, CK, ROCm, API, documentation + :description: Composable Kernel mathematical basis + :keywords: composable kernel, CK, ROCm, API, mathematics, algorithm .. _supported-primitives: ******************************************************************** -Supported Primitives Guide +Composable Kernel mathematical basis ******************************************************************** -This document contains details of supported primitives in Composable Kernel (CK). In contrast to the API Reference Guide, the Supported Primitives Guide is an introduction to the math which underpins the algorithms implemented in CK. +This is an introduction to the math which underpins the algorithms implemented in Composable Kernel. ------------- -Softmax ------------- For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` you can decompose the softmax of concatenated :math:`x = [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ]` as, diff --git a/docs/conceptual/Composable-Kernel-structure.rst b/docs/conceptual/Composable-Kernel-structure.rst new file mode 100644 index 0000000000..43c3603b95 --- /dev/null +++ b/docs/conceptual/Composable-Kernel-structure.rst @@ -0,0 +1,29 @@ +.. meta:: + :description: Composable Kernel structure + :keywords: composable kernel, CK, ROCm, API, structure + +.. 
_what-is-ck: + +******************************************************************** +Composable Kernel structure +******************************************************************** + +The Composable Kernel library uses a tile-based programming model and tensor coordinate transformation to achieve performance portability and code maintainability. Tensor coordinate transformation is a complexity reduction technique for complex machine learning operators. + + +.. image:: ../data/ck_component.png + :alt: CK Components + + +The Composable Kernel library consists of four layers: + +* a templated tile operator layer +* a templated kernel and invoker layer +* an instantiated kernel and invoker layer +* a client API layer. + +A wrapper component is included to simplify tensor transform operations. + +.. image:: ../data/ck_layer.png + :alt: CK Layers + \ No newline at end of file diff --git a/docs/conceptual/what-is-ck.rst b/docs/conceptual/what-is-ck.rst deleted file mode 100644 index 36785fc6ca..0000000000 --- a/docs/conceptual/what-is-ck.rst +++ /dev/null @@ -1,41 +0,0 @@ -.. meta:: - :description: Composable Kernel documentation and API reference library - :keywords: composable kernel, CK, ROCm, API, documentation - -.. _what-is-ck: - -******************************************************************** -What is the Composable Kernel library -******************************************************************** - - -Methodology -=========== - -The Composable Kernel (CK) library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages like HIP C++. - -CK utilizes two concepts to achieve performance portability and code maintainability: - -* A tile-based programming model -* Algorithm complexity reduction for complex ML operators using an innovative technique called - "Tensor Coordinate Transformation". - -.. 
image:: ../data/ck_component.png - :alt: CK Components - - -Code Structure -============== - -The CK library is structured into 4 layers: - -* "Templated Tile Operators" layer -* "Templated Kernel and Invoker" layer -* "Instantiated Kernel and Invoker" layer -* "Client API" layer - -It also includes a simple wrapper component used to perform tensor transform operations more easily and with fewer lines of code. - -.. image:: ../data/ck_layer.png - :alt: CK Layers - \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 30ef672f84..82e4c48001 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,30 +8,33 @@ Composable Kernel User Guide ******************************************************************** -The Composable Kernel (CK) library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages like HIP C++. This document contains instructions for installing, using, and contributing to the Composable Kernel project. To learn more see :ref:`what-is-ck`. +The Composable Kernel library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages such as `HIP C++ `_. -The CK documentation is structured as follows: +The Composable Kernel repository is located at `https://github.com/ROCm/composable-kernel `_. .. grid:: 2 :gutter: 3 - .. grid-item-card:: Installation + .. grid-item-card:: Install - * :ref:`docker-hub` + * :doc:`Composable Kernel prerequisites <./install/Composable-Kernel-prerequisites>` + * :doc:`Build and install Composable Kernel <./install/Composable-Kernel-install>` + * :doc:`Build and install Composable Kernel on a Docker image <./install/Composable-Kernel-Docker>` .. 
grid-item-card:: Conceptual - * :ref:`what-is-ck` + * :doc:`Composable Kernel structure <./conceptual/Composable-Kernel-structure>` + * :doc:`Composable Kernel mathematical basis <./conceptual/Composable-Kernel-math>` - .. grid-item-card:: API reference + .. grid-item-card:: Tutorials + + * :doc:`Composable Kernel examples and tests <./tutorial/Composable-Kernel-examples>` + + .. grid-item-card:: Reference - * :ref:`supported-primitives` * :ref:`api-reference` * :ref:`wrapper` - .. grid-item-card:: Tutorial - - * :ref:`hello-world` To contribute to the documentation refer to `Contributing to ROCm `_. diff --git a/docs/install/Composable-Kernel-Docker.rst b/docs/install/Composable-Kernel-Docker.rst new file mode 100644 index 0000000000..d40cc2bff5 --- /dev/null +++ b/docs/install/Composable-Kernel-Docker.rst @@ -0,0 +1,16 @@ +.. meta:: + :description: Composable Kernel docker files + :keywords: composable kernel, CK, ROCm, API, docker + +.. _docker-hub: + +******************************************************************** +Composable Kernel Docker containers +******************************************************************** + +Docker images that include all the required prerequisites for building Composable Kernel are available on `Docker Hub `_. + +The images also contain `ROCm `_, `CMake `_, and the `ROCm LLVM compiler infrastructure `_. + +Composable Kernel Docker images are named according to their operating system and ROCm version. For example, a Docker image named ``ck_ub22.04_rocm6.3`` would correspond to an Ubuntu 22.04 image with ROCm 6.3. + diff --git a/docs/install/Composable-Kernel-install.rst b/docs/install/Composable-Kernel-install.rst new file mode 100644 index 0000000000..61b1fe0fcb --- /dev/null +++ b/docs/install/Composable-Kernel-install.rst @@ -0,0 +1,72 @@ +.. 
meta:: + :description: Composable Kernel build and install + :keywords: composable kernel, CK, ROCm, API, documentation, install + +****************************************************** +Building and installing Composable Kernel with CMake +****************************************************** + +Before you begin, clone the `Composable Kernel GitHub repository `_ and create a ``build`` directory in its root: + +.. code:: shell + + git clone https://github.com/ROCm/composable_kernel.git + cd composable_kernel + mkdir build + +Change directory to the ``build`` directory and generate the makefile using the ``cmake`` command. Two build options are required: + +* ``CMAKE_PREFIX_PATH``: The ROCm installation path. ROCm is installed in ``/opt/rocm`` by default. +* ``CMAKE_CXX_COMPILER``: The path to the Clang compiler. Clang is found at ``/opt/rocm/llvm/bin/clang++`` by default. + + +.. code:: shell + + cd build + cmake ../. -D CMAKE_PREFIX_PATH="/opt/rocm" -D CMAKE_CXX_COMPILER="/opt/rocm/llvm/bin/clang++" [-D [-D] ...] + + +Other build options are: + +* ``DISABLE_DL_KERNELS``: Set this to "ON" to not build deep learning (DL) and data parallel primitive (DPP) instances. + + .. note:: + + DL and DPP instances are useful on architectures that don't support XDL or WMMA. + +* ``CK_USE_FP8_ON_UNSUPPORTED_ARCH``: Set to ``ON`` to build FP8 data type instances on gfx90a without native FP8 support. +* ``GPU_TARGETS``: Target architectures. Target architectures in this list must all be different versions of the same architectures. Enclose the list of targets in quotation marks. Separate multiple targets with semicolons (``;``). For example, ``cmake -D GPU_TARGETS="gfx908;gfx90a"``. This option is required to build tests and examples. +* ``GPU_ARCHS``: Target architectures. Target architectures in this list are not limited to different versions of the same architectures. Enclose the list of targets in quotation marks. Separate multiple targets with semicolons (``;``). 
For example, ``cmake -D GPU_ARCHS="gfx908;gfx1100"``. +* ``CMAKE_BUILD_TYPE``: The build type. Can be ``None``, ``Release``, ``Debug``, ``RelWithDebInfo``, or ``MinSizeRel``. CMake will use ``Release`` by default. + +.. Note:: + + If neither ``GPU_TARGETS`` nor ``GPU_ARCHS`` is specified, Composable Kernel will be built for all targets supported by the compiler. + +Build Composable Kernel using the generated makefile. This will build the library, the examples, and the tests, and save them to ``bin``. + +.. code:: shell + + make -j20 + +The ``-j`` option speeds up the build by using multiple threads in parallel. For example, ``-j20`` uses twenty threads in parallel. On average, each thread will use 2GB of memory. Make sure that the number of threads you use doesn't exceed the available memory in your system. + +Using ``-j`` alone will launch an unlimited number of threads and is not recommended. + +Install the Composable Kernel library: + +.. code:: shell + + make install + +After running ``make install``, the Composable Kernel files will be saved to the following locations: + +* Library files: ``/opt/rocm/lib/`` +* Header files: ``/opt/rocm/include/ck/`` and ``/opt/rocm/include/ck_tile/`` +* Examples, tests, and ckProfiler: ``/opt/rocm/bin/`` + +For information about ckProfiler, see `the ckProfiler readme file `_. + +For information about running the examples and tests, see :doc:`Composable Kernel examples and tests <../tutorial/Composable-Kernel-examples>`. + + diff --git a/docs/install/Composable-Kernel-prerequisites.rst b/docs/install/Composable-Kernel-prerequisites.rst new file mode 100644 index 0000000000..10be849ea6 --- /dev/null +++ b/docs/install/Composable-Kernel-prerequisites.rst @@ -0,0 +1,32 @@ +.. 
meta:: + :description: Composable Kernel prerequisites + :keywords: composable kernel, CK, ROCm, API, documentation, prerequisites + +****************************************************** +Composable Kernel prerequisites +****************************************************** + +Docker images that include all the required prerequisites for building Composable Kernel are available on `Docker Hub `_. + +The following prerequisites are required to build and install Composable Kernel: + +* cmake +* hip-rocclr +* iputils-ping +* jq +* libelf-dev +* libncurses5-dev +* libnuma-dev +* libpthread-stubs0-dev +* llvm-amdgpu +* mpich +* net-tools +* python3 +* python3-dev +* python3-pip +* redis +* rocm-llvm-dev +* zlib1g-dev +* libzstd-dev +* openssh-server +* clang-format-12 diff --git a/docs/install/dockerhub.rst b/docs/install/dockerhub.rst deleted file mode 100644 index 87eb5a4f81..0000000000 --- a/docs/install/dockerhub.rst +++ /dev/null @@ -1,101 +0,0 @@ -.. meta:: - :description: Composable Kernel documentation and API reference library - :keywords: composable kernel, CK, ROCm, API, documentation - -.. _docker-hub: - -******************************************************************** -CK Docker Hub -******************************************************************** - -Why do I need this? -=================== - -To make things simpler, and bring Composable Kernel and its dependencies together, -docker images can be found on `Docker Hub `_. Docker images provide a complete image of the OS, the Composable Kernel library, and its dependencies in a single downloadable file. - -Refer to `Docker Overview `_ for more information on Docker images and containers. - -Which image is right for me? -============================ - -The image naming includes information related to the docker image. 
-For example ``ck_ub20.04_rocm6.0`` indicates the following: - -* ``ck`` - made for running Composable Kernel; -* ``ub20.04`` - based on Ubuntu 20.04; -* ``rocm6.0`` - ROCm platform version 6.0. - -Download a docker image suitable for your OS and ROCm release, run or start the docker container, and then resume the tutorial from this point. Use the ``docker pull`` command to download the file:: - - docker pull rocm/composable_kernel:ck_ub20.04_rocm6.0 - - -What is inside the image? -------------------------- - -The docker images have everything you need for running CK including: - -* `ROCm `_ -* `CMake `_ -* `Compiler `_ -* `Composable Kernel library `_ - -Running the docker container -============================ - -After downloading the docker image, you can start the container using one of a number of commands. Start with the ``docker run`` command as shown below:: - - docker run \ - -it \ - --privileged \ - --group-add sudo \ - -w /root/workspace \ - -v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ - rocm/composable_kernel:ck_ub20.04_rocm6.0 \ - /bin/bash - -After starting the bash shell, the docker container current folder is `~/workspace`. The library path is ``~/workspace/composable_kernel``. Navigate to the library to begin the tutorial as explained in :ref:`hello-world`: - -.. note:: - - If your current folder is different from `${HOME}`, adjust the line ``-v ${HOME}:/root/workspace`` in the ``docker run`` command to fit your folder structure. - -Stop and restart the docker image -================================= - -After finishing the tutorial, or just when you have completed your work session, you can close the docker container, or stop the docker container to restart it at another time. Closing the docker container means that it is still in the active state, and can be resumed from where you left it. Stopping the container closes it, and returns the image to its initial state. 
- -Use the ``Ctrl-D`` option to exit the container, while leaving it active, so you can return to the container in its current state to resume the tutorial, or pickup your project where you left off. - -To restart the active container use the ``docker exec`` command to specify the container name and options as follows:: - - docker exec -it bash - -Where: - -* `exec` is the docker command -* `-it` is the interactive option for `exec` -* `` specifies an active container on the system -* `bash` specifies the command to run in the interactive shell - -.. note:: - - You can use the ``docker container ls`` command to list the active containers on the system. - -To start a container from the image, use the ``docker start`` command:: - - docker start - -Then use the docker exec command as shown above to start the bash shell. - -Use the ``docker stop`` command to stop the container and restore the image to its initial state:: - - docker stop - -Editing the docker image -======================= - -If you want to customize the docker image, edit the -`Dockerfile `_ -from the GitHub repository to suit your needs. diff --git a/docs/reference/API_Reference_Guide.rst b/docs/reference/Composable-Kernel-API-reference.rst similarity index 79% rename from docs/reference/API_Reference_Guide.rst rename to docs/reference/Composable-Kernel-API-reference.rst index 0d2d41c1eb..b6ee9f7790 100644 --- a/docs/reference/API_Reference_Guide.rst +++ b/docs/reference/Composable-Kernel-API-reference.rst @@ -5,26 +5,20 @@ .. _api-reference: ******************************************************************** -API reference guide +Composable Kernel API reference guide ******************************************************************** - -This document contains details of the APIs for the Composable Kernel (CK) library and introduces -some of the key design principles that are used to write new classes that extend CK functionality. 
+This document contains details of the APIs for the Composable Kernel library and introduces some of the key design principles that are used to write new classes that extend the functionality of the Composable Kernel library. ================= -CK Datatypes -================= - ------------------ DeviceMem ------------------ +================= .. doxygenstruct:: DeviceMem ---------------------------- +============================= Kernels For Flashattention ---------------------------- +============================= The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This section lists the classes that are used in the CK GPU implementation of Flashattention. diff --git a/docs/reference/wrapper.rst b/docs/reference/Composable-Kernel-wrapper.rst similarity index 88% rename from docs/reference/wrapper.rst rename to docs/reference/Composable-Kernel-wrapper.rst index 190fbcd445..4baa8d2b64 100644 --- a/docs/reference/wrapper.rst +++ b/docs/reference/Composable-Kernel-wrapper.rst @@ -1,20 +1,15 @@ .. meta:: - :description: Composable Kernel documentation and API reference library - :keywords: composable kernel, CK, ROCm, API, documentation + :description: Composable Kernel wrapper + :keywords: composable kernel, CK, ROCm, API, wrapper .. _wrapper: ******************************************************************** -Wrapper +Composable Kernel wrapper ******************************************************************** -------------------------------------- -Description -------------------------------------- - -The CK library provides a lightweight wrapper for more complex operations implemented in -the library. +The Composable Kernel library provides a lightweight wrapper to simplify the more complex operations. 
Example: diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 533b81cd39..ab82b7deb1 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -3,34 +3,38 @@ defaults: root: index subtrees: -- caption: Conceptual - entries: - - file: conceptual/what-is-ck.rst - title: What is Composable Kernel? - - caption: Install entries: - - file: install/dockerhub.rst - title: Docker Hub - -- caption: CK API Reference + - file: install/Composable-Kernel-prerequisites.rst + title: Composable Kernel prerequisites + - file: install/Composable-Kernel-install.rst + title: Build and install Composable Kernel + - file: install/Composable-Kernel-Docker.rst + title: Composable Kernel Docker images + +- caption: Conceptual entries: - - file: reference/Supported_Primitives_Guide.rst - title: Supported Primitives - - file: reference/API_Reference_Guide.rst - title: API Reference - - file: reference/wrapper.rst - title: Wrapper + - file: conceptual/Composable-Kernel-structure.rst + title: Composable Kernel structure + - file: conceptual/Composable-Kernel-math.rst + title: Composable Kernel mathematical basis - caption: Tutorial entries: - - file: tutorial/tutorial_hello_world.rst - title: Hello World Tutorial + - file: tutorial/Composable-Kernel-examples.rst + title: Composable Kernel examples + +- caption: Reference + entries: + - file: reference/Composable-Kernel-API-reference.rst + title: Composable Kernel API reference + - file: reference/Composable-Kernel-wrapper.rst + title: Composable Kernel Wrapper - caption: About entries: - file: Contributors_Guide.rst - title: Contributing to CK + title: Contributing to Composable Kernel - file: license.rst title: License \ No newline at end of file diff --git a/docs/tutorial/Composable-Kernel-examples.rst b/docs/tutorial/Composable-Kernel-examples.rst new file mode 100644 index 0000000000..62422d6f15 --- /dev/null +++ b/docs/tutorial/Composable-Kernel-examples.rst @@ -0,0 +1,40 @@ +.. 
meta:: + :description: Composable Kernel examples and tests + :keywords: composable kernel, CK, ROCm, API, examples, tests + +******************************************************************** +Composable Kernel examples and tests +******************************************************************** + +After :doc:`building and installing Composable Kernel <../install/Composable-Kernel-install>`, the examples and tests will be moved to ``/opt/rocm/bin/``. + +All tests have the prefix ``test`` and all examples have the prefix ``example``. + +Use ``ctest`` with no arguments to run all examples and tests, or use ``ctest -R`` to run a single test. For example: + +.. code:: shell + + ctest -R test_gemm_fp16 + +Examples can be run individually as well. For example: + +.. code:: shell + + ./bin/example_gemm_xdl_fp16 1 1 1 + +For instructions on how to run individual examples and tests, see their README files in the |example|_ and |test|_ GitHub folders. + +To run smoke tests, use ``make smoke``. + +To run regression tests, use ``make regression``. + +In general, tests that run for under thirty seconds are included in the smoke tests and tests that run for over thirty seconds are included in the regression tests. + +.. |example| replace:: ``example`` +.. _example: https://github.com/ROCm/composable_kernel/tree/develop/example + +.. |client_example| replace:: ``client_example`` +.. _client_example: https://github.com/ROCm/composable_kernel/tree/develop/client_example + +.. |test| replace:: ``test`` +.. _test: https://github.com/ROCm/composable_kernel/tree/develop/test \ No newline at end of file diff --git a/docs/tutorial/tutorial_hello_world.rst b/docs/tutorial/tutorial_hello_world.rst deleted file mode 100644 index c31460785b..0000000000 --- a/docs/tutorial/tutorial_hello_world.rst +++ /dev/null @@ -1,165 +0,0 @@ -.. meta:: - :description: Composable Kernel documentation and API reference library - :keywords: composable kernel, CK, ROCm, API, documentation - -.. 
_hello-world: - -******************************************************************** -Hello World Tutorial -******************************************************************** - -This tutorial is for engineers dealing with artificial intelligence and machine learning who -would like to optimize pipelines and improve performance using the Composable -Kernel (CK) library. This tutorial provides an introduction to the CK library. You will build the library and run some examples using a "Hello World" example. - -Description -=========== - -Modern AI technology solves more and more problems in a variety of fields, but crafting fast and -efficient workflows is still challenging. CK can make the AI workflow fast -and efficient. CK is a collection of optimized AI operator kernels with tools to create -new kernels. The library has components required for modern neural network architectures -including matrix multiplication, convolution, contraction, reduction, attention modules, a variety of activation functions, and fused operators. - -CK library acceleration features are based on: - -* Layered structure -* Tile-based computation model -* Tensor coordinate transformation -* Hardware acceleration use -* Support of low precision data types including fp16, bf16, int8 and int4 - -If you need more technical details and benchmarking results read the following -`blog post `_. - -To download the library visit the `composable_kernel repository `_. - -Hardware targets -================ - -CK library fully supports `gfx908` and `gfx90a` GPU architectures, while only some operators are -supported for `gfx1030` devices. Check your hardware to determine the target GPU architecture. 
- -========== ========= -GPU Target AMD GPU -========== ========= -gfx908 Radeon Instinct MI100 -gfx90a Radeon Instinct MI210, MI250, MI250X -gfx1030 Radeon PRO V620, W6800, W6800X, W6800X Duo, W6900X, RX 6800, RX 6800 XT, RX 6900 XT, RX 6900 XTX, RX 6950 XT -========== ========= - -There are also `cloud options `_ you can find if -you don't have an AMD GPU at hand. - -Build the library -================= - -This tutorial is based on the use of docker images as explained in :ref:`docker-hub`. Download a docker image suitable for your OS and ROCm release, run or start the docker container, and then resume the tutorial from this point. - -.. note:: - - You can also `install ROCm `_ on your system, clone the `Composable Kernel repository `_ on GitHub, and use that to build and run the examples using the commands described below. - -Both the docker container and GitHub repository include the Composable Kernel library. Navigate to the library:: - - cd composable_kernel/ - -Create and change to a ``build`` directory:: - - mkdir build && cd build - -The previous section discussed supported GPU architecture. Once you decide which hardware targets are needed, run CMake using the ``GPU_TARGETS`` flag:: - - cmake \ - -D CMAKE_PREFIX_PATH=/opt/rocm \ - -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ - -D CMAKE_CXX_FLAGS="-O3" \ - -D CMAKE_BUILD_TYPE=Release \ - -D BUILD_DEV=OFF \ - -D GPU_TARGETS="gfx908;gfx90a;gfx1030" .. 
- -If everything goes well the CMake command will return:: - - -- Configuring done - -- Generating done - -- Build files have been written to: "/root/workspace/composable_kernel/build" - -Finally, you can build examples and tests:: - - make -j examples tests - -When complete you should see:: - - Scanning dependencies of target tests - [100%] Built target tests - -Run examples and tests -====================== - -Examples are listed as test cases as well, so you can run all examples and tests with:: - - ctest - -You can check the list of all tests by running:: - - ctest -N - -You can also run examples separately as shown in the following example execution:: - - ./bin/example_gemm_xdl_fp16 1 1 1 - -The arguments ``1 1 1`` mean that you want to run this example in the mode: verify results with CPU, initialize matrices with integers, and benchmark the kernel execution. You can play around with these parameters and see how output and execution results change. - -If you have a device based on `gfx908` or `gfx90a` architecture, and if the example runs as expected, you should see something like:: - - a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} - b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1} - c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} - Perf: 1.08153 ms, 119.136 TFlops, 89.1972 GB/s, DeviceGemm_Xdl_CShuffle LoopScheduler: Interwave, PipelineVersion: v1 - -However, running it on a `gfx1030` device should result in the following:: - - a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} - b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} - c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} - DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1 does not support this problem - -Don't worry, some operators are supported on `gfx1030` architecture, so you can run a -separate example like:: - - ./bin/example_gemm_dl_fp16 1 1 1 - -and it should return something like:: - - a_m_k: dim 2, 
lengths {3840, 4096}, strides {1, 4096} - b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1} - c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} - arg.a_grid_desc_k0_m0_m1_k1_{2048, 3840, 2} - arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2} - arg.c_grid_desc_m_n_{ 3840, 4096} - launch_and_time_kernel: grid_dim {960, 1, 1}, block_dim {256, 1, 1} - Warm up 1 time - Start running 10 times... - Perf: 3.65695 ms, 35.234 TFlops, 26.3797 GB/s, DeviceGemmDl<256, 128, 128, 16, 2, 4, 4, 1> - -.. note:: - - A new CMake flag ``DL_KERNELS`` has been added to the latest versions of CK. If you do not see the above results when running ``example_gemm_dl_fp16``, you might need to add ``-D DL_KERNELS=ON`` to your CMake command to build the operators supported on the `gfx1030` architecture. - -You can also run a separate test:: - - ctest -R test_gemm_fp16 - -If everything goes well you should see something like:: - - Start 121: test_gemm_fp16 - 1/1 Test #121: test_gemm_fp16 ................... Passed 51.81 sec - - 100% tests passed, 0 tests failed out of 1 - -Summary -======= - -In this tutorial you took the first look at the Composable Kernel library, built it on your system and ran some examples and tests. In the next tutorial you will run kernels with different configurations to find out the best one for your hardware and task. - -P.S.: If you are running on a cloud instance, don't forget to switch off the cloud instance. 
From d142e15f5e18f9c9cfa66d1de6479d8f2583827d Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 27 Mar 2025 18:48:47 -0700 Subject: [PATCH 003/443] add gfx950 to default targets for rocm6.4+ (#2032) --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bb0c254e06..4c1ca789f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -167,8 +167,10 @@ if(NOT ENABLE_ASAN_PACKAGING) if(NOT WIN32 AND ${hip_VERSION_FLAT} LESS 600300000) # WORKAROUND: compiler does not yet fully support gfx12 targets, need to fix version above set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102") - else() + elseif(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER_EQUAL 600300000 AND ${hip_VERSION_FLAT} LESS 600400000) set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201") + elseif(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER_EQUAL 600400000) + set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950") endif() else() #build CK only for xnack-supported targets when using ASAN From a82f338fb9fb5743f071c5e6831c3dd92fcd7982 Mon Sep 17 00:00:00 2001 From: felix Date: Fri, 28 Mar 2025 11:31:52 +0800 Subject: [PATCH 004/443] hotfix fix sorting int64 (#2025) * fix sorting int64 * clang format * fix example issue * update WA issue # --------- Co-authored-by: coderfeli Co-authored-by: carlushuang --- .../ck_tile/13_moe_sorting/moe_sorting.cpp | 4 +-- .../15_fused_moe/instances/fused_moe_api.cpp | 29 ++++++++++--------- include/ck_tile/core/config.hpp | 4 +++ .../fused_moe/kernel/moe_sorting_kernel.hpp | 18 ++++++++---- 4 files changed, 33 insertions(+), 22 deletions(-) diff --git a/example/ck_tile/13_moe_sorting/moe_sorting.cpp b/example/ck_tile/13_moe_sorting/moe_sorting.cpp index f00d948f25..e59fcaedad 100644 --- a/example/ck_tile/13_moe_sorting/moe_sorting.cpp +++ 
b/example/ck_tile/13_moe_sorting/moe_sorting.cpp @@ -74,7 +74,7 @@ bool test_moe_sorting(ck_tile::ArgParser args) int topk = args.get_int("k"); int seed = args.get_int("seed"); int unit_size = args.get_int("unit"); - int moe_buf_size = args.get_int("moe_buf_size"); + int64_t moe_buf_size = static_cast(args.get_uint64("moe_buf_size")); int kname = args.get_int("kname"); int warmup = args.get_int("warmup"); int repeat = args.get_int("repeat"); @@ -175,7 +175,7 @@ bool test_moe_sorting(ck_tile::ArgParser args) unit_size, num_experts, topk, - static_cast(moe_buf_size * sizeof(float))}; + static_cast(moe_buf_size * sizeof(float))}; ck_tile::stream_config sc{nullptr, true, diff --git a/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp index 466420f066..f887d57aa9 100644 --- a/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp +++ b/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp @@ -19,20 +19,21 @@ float fused_moe(fused_moe_traits t, fused_moe_args a, const ck_tile::stream_conf auto t0 = fused_moesorting_trait{"int32", "fp32", t.local_expert_masking}; auto a0 = fused_moesorting_args{ - a.topk_ids_ptr, // const void* p_topk_ids; - a.topk_weight_ptr, // const void* p_weights; - a.local_expert_mask_ptr, // const void* p_local_expert_mask; - a.sorted_token_ids_ptr, // void* p_sorted_token_ids; - a.sorted_weight_ptr, // void* p_sorted_weights; - a.sorted_expert_ids_ptr, // void* p_sorted_expert_ids; - a.num_sorted_tiles_ptr, // void* p_total_tokens_post_pad; - a.o_ptr, // void* p_moe_buf; - a.ws_ptr, // void* p_ws; - a.num_tokens, // index_t tokens; - a.block_m, // index_t unit_size; - a.num_experts, // index_t num_experts; - a.topk, // index_t topk; - a.num_tokens * a.stride_token * o_data_bytes // index_t moe_buf_bytes; + a.topk_ids_ptr, // const void* p_topk_ids; + a.topk_weight_ptr, // const void* p_weights; + a.local_expert_mask_ptr, // const void* p_local_expert_mask; + 
a.sorted_token_ids_ptr, // void* p_sorted_token_ids; + a.sorted_weight_ptr, // void* p_sorted_weights; + a.sorted_expert_ids_ptr, // void* p_sorted_expert_ids; + a.num_sorted_tiles_ptr, // void* p_total_tokens_post_pad; + a.o_ptr, // void* p_moe_buf; + a.ws_ptr, // void* p_ws; + a.num_tokens, // index_t tokens; + a.block_m, // index_t unit_size; + a.num_experts, // index_t num_experts; + a.topk, // index_t topk; + static_cast(a.num_tokens) * a.stride_token * + o_data_bytes // index_t moe_buf_bytes; }; auto t1 = fused_moegemm_traits{t.prec_i, diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp index eeaf0dca6f..b1d201e30e 100644 --- a/include/ck_tile/core/config.hpp +++ b/include/ck_tile/core/config.hpp @@ -260,3 +260,7 @@ CK_TILE_DECLARE_ENV_VAR_BOOL(CK_TILE_LOGGING) #define CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN 0 #endif #endif + +#ifndef CK_TILE_WA_ISSUE_2028 +#define CK_TILE_WA_ISSUE_2028 1 +#endif diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp index a1410d1f4f..6a7ccd2472 100644 --- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp +++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp @@ -192,7 +192,7 @@ struct MoeSortingHostArgs index_t unit_size; // this is the M_a of fused-moe kernel index_t num_experts; index_t topk; - index_t moe_buf_bytes; // byte size of p_moe_buf + long_index_t moe_buf_bytes; // byte size of p_moe_buf }; template @@ -219,7 +219,7 @@ struct MoeSortingKernel void* p_moe_buf; index_t tokens; index_t num_experts; - index_t moe_buf_bytes; + long_index_t moe_buf_bytes; index_t tokens_per_thread; index_t smem_rows; @@ -426,7 +426,7 @@ struct MoeSortingKernel return row * total_col + col; } - CK_TILE_DEVICE void moe_buf_set_zero_kernel(uint8x16_t* buf, index_t buf_bytes) const + CK_TILE_DEVICE void moe_buf_set_zero_kernel(uint8x16_t* buf, long_index_t buf_bytes) const { const index_t offset = (blockIdx.x - 1) 
* blockDim.x + threadIdx.x; if(offset < buf_bytes / 16) @@ -1218,10 +1218,10 @@ CK_TILE_DEVICE void moe_sorting_wave_cumsum(data_t& thread_data) } template -CK_TILE_DEVICE void moe_buf_set_zero_kernel(uint8x16_t* buf, index_t buf_bytes, index_t gid) +CK_TILE_DEVICE void moe_buf_set_zero_kernel(uint8x16_t* buf, long_index_t buf_bytes, index_t gid) { // const index_t offset = (blockIdx.x - 1) * BLOCK_SIZE + threadIdx.x; - index_t offset = gid * BLOCK_SIZE + threadIdx.x; + long_index_t offset = static_cast(gid) * BLOCK_SIZE + threadIdx.x; if(offset < buf_bytes / 16) { buf[offset] = uint8x16_t{0}; @@ -1233,6 +1233,12 @@ CK_TILE_DEVICE void moe_buf_set_zero_kernel(uint8x16_t* buf, index_t buf_bytes, // prefer to run mp kernel if is not oneshot CK_TILE_HOST bool moe_sorting_is_oneshot(int tokens_, int num_experts_) { +#if CK_TILE_WA_ISSUE_2028 + if(tokens_ >= 65536 * 2) + { + return true; + } +#endif auto sub_token_ = moe_sorting_get_sub_token(tokens_, num_experts_); bool is_sub_token_onshot = tokens_ <= sub_token_; return is_sub_token_onshot; @@ -1523,7 +1529,7 @@ struct MoeSortingMultiPhaseKernel_P2 index_t num_experts; index_t mesh_stride; // mesh_stride for p_expert_mesh mdiv unit_size_mdiv; - index_t moe_buf_bytes; + long_index_t moe_buf_bytes; }; CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h) From 8a20b62e9124c10f4f240dce2c312b0a332bce6c Mon Sep 17 00:00:00 2001 From: rocking Date: Fri, 28 Mar 2025 21:58:06 +0800 Subject: [PATCH 005/443] Reduce redundant space in bias tensor (#2024) Co-authored-by: Po Yen Chen --- example/ck_tile/01_fmha/fmha_fwd.cpp | 12 ++++++------ include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp index b3855e59df..8f6fb8df54 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.cpp +++ b/example/ck_tile/01_fmha/fmha_fwd.cpp @@ -620,7 +620,7 @@ bool run(const ck_tile::ArgParser& arg_parser) : 
std::array{1, 1, 1, 1} /* dummy shape for simplifying code */); ck_tile::HostTensor bias_host( bias.type == bias_enum::elementwise_bias - ? get_lengths(i_perm, 1, 1, shape_seqlen_q, shape_seqlen_k) + ? get_lengths(i_perm, 1, 1, shape_seqlen_q, max_seqlen_k) : std::array{1, 1, 1, 1} /* dummy shape for simplifying code */); ck_tile::HostTensor alibi_slope_host( @@ -884,7 +884,7 @@ bool run(const ck_tile::ArgParser& arg_parser) else return i_perm ? seqlen_knew : nhead_k * seqlen_knew; }(); - const ck_tile::index_t stride_bias = (i_perm ? shape_seqlen_k : 1 * shape_seqlen_k); + const ck_tile::index_t stride_bias = (i_perm ? max_seqlen_k : 1 * max_seqlen_k); const ck_tile::index_t stride_randval = (max_seqlen_k); const ck_tile::index_t stride_o_acc = (hdim_v); const ck_tile::index_t stride_o = (o_perm ? hdim_v : nhead * hdim_v); @@ -909,7 +909,7 @@ bool run(const ck_tile::ArgParser& arg_parser) return i_perm ? hdim_v * seqlen_knew : seqlen_knew; }(); const ck_tile::index_t nhead_stride_bias = - (i_perm ? 0 * shape_seqlen_q * shape_seqlen_k : 0 * shape_seqlen_k); + (i_perm ? 0 * shape_seqlen_q * max_seqlen_k : 0 * max_seqlen_k); const ck_tile::index_t nhead_stride_randval = (shape_seqlen_q * max_seqlen_k); const ck_tile::index_t nhead_stride_lse = shape_seqlen_q; const ck_tile::index_t nhead_stride_lse_acc = (num_splits * shape_seqlen_q); @@ -925,7 +925,7 @@ bool run(const ck_tile::ArgParser& arg_parser) (0 < page_block_size ? 
(nhead_k * hdim_v * page_block_size) : (nhead_k * hdim_v * shape_seqlen_k)); const ck_tile::index_t batch_stride_vnew = (nhead_k * hdim_v * seqlen_knew); - const ck_tile::index_t batch_stride_bias = (0 * nhead * shape_seqlen_q * shape_seqlen_k); + const ck_tile::index_t batch_stride_bias = (0 * nhead * shape_seqlen_q * max_seqlen_k); const ck_tile::index_t batch_stride_randval = (nhead * shape_seqlen_q * max_seqlen_k); const ck_tile::index_t batch_stride_lse = (nhead * shape_seqlen_q); const ck_tile::index_t batch_stride_lse_acc = (nhead * num_splits * shape_seqlen_q); @@ -1381,9 +1381,9 @@ bool run(const ck_tile::ArgParser& arg_parser) ck_tile::HostTensor bias_host_ref({1, real_seqlen_q, real_seqlen_k}); // clang-format off if(i_perm) - bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, 0, i[1] + query_offset, i[2] + key_offset); }); + bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, 0, i[1] + query_offset, i[2]); }); else - bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, i[1] + query_offset, 0, i[2] + key_offset); }); + bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, i[1] + query_offset, 0, i[2]); }); // clang-format on // broadcast from [1, real_seqlen_q, real_seqlen_k] to [nhead, real_seqlen_q, diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp index a578f0c2f4..1202524950 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp @@ -983,7 +983,7 @@ struct FmhaFwdKernel } if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) { - batch_offset_bias = query_start * kargs.stride_bias + key_start; + batch_offset_bias = query_start * kargs.stride_bias; } if constexpr(kStoreLSE) { From fc073b483e03caa7377e56a6b8b3573054e031fa Mon Sep 17 00:00:00 2001 From: Adel Johar Date: Fri, 28 Mar 2025 15:12:27 +0100 Subject: [PATCH 006/443] Docs: Add 
precision support reference page (#1973) * Docs: Add precision support reference page * edit of the precision type content * added more description on scalars --------- Co-authored-by: spolifroni-amd Co-authored-by: Aviral Goel --- .gitignore | 2 + docs/index.rst | 6 +- .../Composable_Kernel_custom_types.rst | 39 +++++++++++ ...mposable_Kernel_supported_scalar_types.rst | 69 +++++++++++++++++++ .../Composable_Kernel_vector_utilities.rst | 16 +++++ docs/sphinx/_toc.yml.in | 9 ++- 6 files changed, 137 insertions(+), 4 deletions(-) create mode 100644 docs/reference/Composable_Kernel_custom_types.rst create mode 100644 docs/reference/Composable_Kernel_supported_scalar_types.rst create mode 100644 docs/reference/Composable_Kernel_vector_utilities.rst diff --git a/.gitignore b/.gitignore index f4d5ff7abd..599ef99e35 100644 --- a/.gitignore +++ b/.gitignore @@ -55,6 +55,8 @@ _static/ _templates/ _toc.yml _doxygen/ +docs/doxygen/html +docs/doxygen/xml # JetBrains IDE .idea/ diff --git a/docs/index.rst b/docs/index.rst index 82e4c48001..15a9321d43 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,7 +8,7 @@ Composable Kernel User Guide ******************************************************************** -The Composable Kernel library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages such as `HIP C++ `_. +The Composable Kernel library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages such as `HIP C++ `_. The Composable Kernel repository is located at `https://github.com/ROCm/composable-kernel `_. @@ -32,10 +32,12 @@ The Composable Kernel repository is located at `https://github.com/ROCm/composab .. 
grid-item-card:: Reference + * :doc:`Composable Kernel supported scalar types <./reference/Composable_Kernel_supported_scalar_types>` + * :doc:`Composable Kernel custom types <./reference/Composable_Kernel_custom_types>` + * :doc:`Composable Kernel vector utilities <./reference/Composable_Kernel_vector_utilities>` * :ref:`api-reference` * :ref:`wrapper` - To contribute to the documentation refer to `Contributing to ROCm `_. You can find licensing information on the `Licensing `_ page. diff --git a/docs/reference/Composable_Kernel_custom_types.rst b/docs/reference/Composable_Kernel_custom_types.rst new file mode 100644 index 0000000000..863d4131b9 --- /dev/null +++ b/docs/reference/Composable_Kernel_custom_types.rst @@ -0,0 +1,39 @@ +.. meta:: + :description: Composable Kernel supported custom types + :keywords: composable kernel, custom, data types, support, CK, ROCm + +****************************************************** +Composable Kernel custom data types +****************************************************** + +Composable Kernel supports the use of custom types that provide a way to implement specialized numerical formats. + +To use custom types, a C++ type that implements the necessary operations for tensor computations needs to be created. These should include: + +* Constructors and initialization methods +* Arithmetic operators if the type will be used in computational operations +* Any conversion functions needed to interface with other parts of an application + +For example, to create a complex half-precision type: + +.. 
code:: cpp + + struct complex_half_t + { + half_t real; + half_t img; + }; + + struct complex_half_t + { + using type = half_t; + type real; + type img; + + complex_half_t() : real{type{}}, img{type{}} {} + complex_half_t(type real_init, type img_init) : real{real_init}, img{img_init} {} + }; + +Custom types can be particularly useful for specialized applications such as complex number arithmetic, +custom quantization schemes, or domain-specific number representations. + diff --git a/docs/reference/Composable_Kernel_supported_scalar_types.rst b/docs/reference/Composable_Kernel_supported_scalar_types.rst new file mode 100644 index 0000000000..7ea1a9eaeb --- /dev/null +++ b/docs/reference/Composable_Kernel_supported_scalar_types.rst @@ -0,0 +1,69 @@ +.. meta:: + :description: Composable Kernel supported scalar types + :keywords: composable kernel, scalar, data types, support, CK, ROCm + +*************************************************** +Composable Kernel supported scalar data types +*************************************************** + +The Composable Kernel library provides support for the following scalar data types: + +.. 
list-table:: + :header-rows: 1 + :widths: 25 15 60 + + * - Type + - Bit Width + - Description + + * - ``double`` + - 64-bit + - Standard IEEE 754 double precision floating point + + * - ``float`` + - 32-bit + - Standard IEEE 754 single precision floating point + + * - ``int32_t`` + - 32-bit + - Standard signed 32-bit integer + + * - ``int8_t`` + - 8-bit + - Standard signed 8-bit integer + + * - ``uint8_t`` + - 8-bit + - Standard unsigned 8-bit integer + + * - ``bool`` + - 1-bit + - Boolean type + + * - ``ck::half_t`` + - 16-bit + - IEEE 754 half precision floating point with 5 exponent bits, 10 mantissa bits, and 1 sign bit + + * - ``ck::bhalf_t`` + - 16-bit + - Brain floating point with 8 exponent bits, 7 mantissa bits, and 1 sign bit + + * - ``ck::f8_t`` + - 8-bit + - 8-bit floating point (E4M3 format) with 4 exponent bits, 3 mantissa bits, and 1 sign bit + + * - ``ck::bf8_t`` + - 8-bit + - 8-bit brain floating point (E5M2 format) with 5 exponent bits, 2 mantissa bits, and 1 sign bit + + * - ``ck::f4_t`` + - 4-bit + - 4-bit floating point format (E2M1 format) with 2 exponent bits, 1 mantissa bit, and 1 sign bit + + * - ``ck::f6_t`` + - 6-bit + - 6-bit floating point format (E2M3 format) with 2 exponent bits, 3 mantissa bits, and 1 sign bit + + * - ``ck::bf6_t`` + - 6-bit + - 6-bit brain floating point format (E3M2 format) with 3 exponent bits, 2 mantissa bits, and 1 sign bit \ No newline at end of file diff --git a/docs/reference/Composable_Kernel_vector_utilities.rst b/docs/reference/Composable_Kernel_vector_utilities.rst new file mode 100644 index 0000000000..3103653191 --- /dev/null +++ b/docs/reference/Composable_Kernel_vector_utilities.rst @@ -0,0 +1,16 @@ +.. 
meta:: + :description: Composable Kernel supported precision types and custom type support + :keywords: composable kernel, precision, data types, ROCm + +****************************************************** +Composable Kernel vector template utilities +****************************************************** + +Composable Kernel includes template utilities for creating vector types with customizable widths. These template utilities also flatten nested vector types into a single, wider vector, preventing the creation of vectors of vectors. + +Vectors composed of supported scalar and custom types can be created with the ``ck::vector_type`` template. + +For example, ``ck::vector_type`` creates a vector composed of four floats and ``ck::vector_type`` creates a vector composed of eight half-precision scalars. + +For vector operations to be valid, the underlying types must be either a :doc:`supported scalar type ` or :doc:`a custom type ` that implements the required operations. + diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index ab82b7deb1..df98998224 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -11,7 +11,7 @@ subtrees: title: Build and install Composable Kernel - file: install/Composable-Kernel-Docker.rst title: Composable Kernel Docker images - + - caption: Conceptual entries: - file: conceptual/Composable-Kernel-structure.rst @@ -26,6 +26,12 @@ subtrees: - caption: Reference entries: + - file: reference/Composable_Kernel_supported_scalar_types.rst + title: Composable Kernel scalar types + - file: reference/Composable_Kernel_custom_types.rst + title: Composable Kernel custom types + - file: reference/Composable_Kernel_vector_utilities.rst + title: Composable Kernel vector utilities - file: reference/Composable-Kernel-API-reference.rst title: Composable Kernel API reference - file: reference/Composable-Kernel-wrapper.rst @@ -37,4 +43,3 @@ subtrees: title: Contributing to Composable Kernel - file: license.rst title: License - 
\ No newline at end of file From 16b15e336a13e60f54ac9ea03975b9cf44b1d6f3 Mon Sep 17 00:00:00 2001 From: jefyang1 <146495389+jefyang1@users.noreply.github.com> Date: Mon, 31 Mar 2025 09:20:52 -0700 Subject: [PATCH 007/443] Fix gemm universal and grouped_conv_fwd test failures on gfx950 (#2031) --- .../device_grouped_conv_fwd_xdl_comp_instance.hpp | 5 ++++- .../device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp | 2 +- .../device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp index e7bbf8a26a..f491474d38 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp @@ -84,7 +84,6 @@ using device_grouped_conv_fwd_xdl_bf16_comp_instances = std::tuple< DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, 
S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, @@ -94,6 +93,7 @@ using device_grouped_conv_fwd_xdl_bf16_comp_instances = std::tuple< // clang-format on >; +// instances not working on gfx950 template using device_grouped_conv_fwd_xdl_bf16_comp_instances_part2 = std::tuple< // clang-format off + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, // AGPR Spill when use permuted lds layout. so, use padding for these two. 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 64, 1, 4>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, @@ -143,6 +144,7 @@ using device_grouped_conv_fwd_xdl_f16_comp_instances = std::tuple< // clang-format on >; +// instances not working on gfx950 template ; +// instances not working on gfx950 template , S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 8, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, 
PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, @@ -62,6 +61,7 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances = std::tu template using device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances_part2 = std::tuple< // clang-format off + DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 8, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 8, 32, 32, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, 
GemmSpec, 256, 256, 224, 64, 8, 8, 16, 16, 8, 7, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 32, 1, 8>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3> // clang-format on diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp index 50fdca9348..9f142ad831 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp @@ -44,7 +44,6 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances = std::tu DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 4, 4, 32, 32, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 2, 2, 32, 32, 4, 4, S<16,16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, 
S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, @@ -55,6 +54,7 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances = std::tu template using device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances_part2 = std::tuple< // clang-format off + DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 
1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGemm_Xdl_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 64, 8, 8, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 0, 1, 2, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3> // clang-format on From dd4c12b155c6eece31e851b3aa46939d00f6adbd Mon Sep 17 00:00:00 2001 From: Muhammed Emin Ozturk Date: Mon, 31 Mar 2025 19:30:17 -0700 Subject: [PATCH 008/443] f8/bf16 GEMM Stream-K (#1879) --- CHANGELOG.md | 2 +- example/01_gemm/CMakeLists.txt | 6 + .../01_gemm/gemm_xdl_fp16_fp8_streamk_v3.cpp | 64 +++++++ .../gpu/gemm_universal_streamk.hpp | 129 ++++++++++++- .../gpu/gemm_universal_streamk/CMakeLists.txt | 23 ++- ...versal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp | 2 + ...bf16_mk_nk_mn_comp_mnkpadding_instance.cpp | 31 +++ ..._bf16_mk_nk_mn_comp_mnpadding_instance.cpp | 30 +++ ...16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp | 31 +++ ...16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp | 31 +++ ..._universal_streamk_f8_f8_bf16_mk_kn_mn.hpp | 99 ++++++++++ ...f8_bf16_mk_kn_mn_comp_default_instance.cpp | 24 +++ ...8_bf16_mk_kn_mn_comp_kpadding_instance.cpp | 24 +++ ..._bf16_mk_kn_mn_comp_nkpadding_instance.cpp | 24 +++ ..._bf16_mk_kn_mn_mem_v1_default_instance.cpp | 25 +++ ...bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp | 25 +++ ...f16_mk_kn_mn_mem_v1_nkpadding_instance.cpp | 25 +++ ..._bf16_mk_kn_mn_mem_v2_default_instance.cpp | 25 +++ ...bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp | 25 +++ 
...f16_mk_kn_mn_mem_v2_nkpadding_instance.cpp | 25 +++ ..._universal_streamk_f8_f8_bf16_mk_nk_mn.hpp | 107 +++++++++++ ...f8_bf16_mk_nk_mn_comp_default_instance.cpp | 24 +++ ...8_bf16_mk_nk_mn_comp_kpadding_instance.cpp | 24 +++ ..._bf16_mk_nk_mn_mem_v1_default_instance.cpp | 25 +++ ...bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp | 25 +++ ..._bf16_mk_nk_mn_mem_v2_default_instance.cpp | 25 +++ ...bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp | 25 +++ .../profile_gemm_universal_streamk_impl.hpp | 38 ++-- .../src/profile_gemm_universal_streamk.cpp | 44 +++-- test/CMakeLists.txt | 4 + test/gemm_universal_streamk/CMakeLists.txt | 15 ++ ...t_gemm_universal_streamk_ut_cases_bf16.inc | 177 ++++++++++++++++++ ...t_gemm_universal_streamk_ut_cases_fp16.inc | 113 +++++++++++ ...st_gemm_universal_streamk_ut_cases_fp8.inc | 113 +++++++++++ .../test_gemm_universal_streamk_util.hpp | 104 ++++++++++ .../test_gemm_universal_streamk_xdl_bf16.cpp | 85 +++++++++ .../test_gemm_universal_streamk_xdl_fp16.cpp | 84 +++++++++ .../test_gemm_universal_streamk_xdl_fp8.cpp | 74 ++++++++ 38 files changed, 1738 insertions(+), 38 deletions(-) create mode 100644 example/01_gemm/gemm_xdl_fp16_fp8_streamk_v3.cpp mode change 100644 => 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnkpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp create mode 100755 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_default_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instance.cpp create mode 100755 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp mode change 100644 => 100755 profiler/src/profile_gemm_universal_streamk.cpp mode change 100644 => 100755 test/CMakeLists.txt create mode 
100755 test/gemm_universal_streamk/CMakeLists.txt create mode 100644 test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_bf16.inc create mode 100644 test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp16.inc create mode 100755 test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp8.inc create mode 100644 test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp create mode 100755 test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_bf16.cpp create mode 100644 test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp16.cpp create mode 100755 test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp8.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d07abfc24..de831a6898 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj * Added support for bf16, f32, and f16 for 2D and 3D NGCHW grouped convolution backward data * Added support GKCYX layout for grouped convolution forward (NGCHW/GKCYX/NGKHW, number of instances in instance factory for NGCHW/GKYXC/NGKHW has been reduced). 
- +* Added support for Stream-K version of mixed fp8/bf16 GEMM ### Optimized None diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index ee9f959d94..96678d275a 100755 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -28,8 +28,14 @@ add_example_executable(example_gemm_xdl_fp16_v3 gemm_xdl_fp16_v3.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_v3) add_example_executable(example_gemm_xdl_fp8_v3 gemm_xdl_fp8_v3.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_v3) + add_example_executable(example_gemm_xdl_fp16_fp8_v3 gemm_xdl_fp16_fp8_v3.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8_v3) + + +add_example_executable(example_gemm_xdl_fp16_fp8_streamk_v3 gemm_xdl_fp16_fp8_streamk_v3.cpp) +add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8_streamk_v3) + add_example_executable(example_gemm_xdl_bf16_v3 gemm_xdl_bf16_v3.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_v3) diff --git a/example/01_gemm/gemm_xdl_fp16_fp8_streamk_v3.cpp b/example/01_gemm/gemm_xdl_fp16_fp8_streamk_v3.cpp new file mode 100644 index 0000000000..bd38eb17ee --- /dev/null +++ b/example/01_gemm/gemm_xdl_fp16_fp8_streamk_v3.cpp @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" + +using ADataType = ck::half_t; +using BDataType = ck::f8_t; +using AccDataType = float; +using CShuffleDataType = ck::half_t; +using CDataType = ck::half_t; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmV2_Streamk_Instance = + ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3< + ALayout, BLayout, CLayout, + ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, + AElementOp, BElementOp, CElementOp, GemmDefault, + 64, + 16, 16, + 256, 8, 16, + 16, 16, + 1, 1, + S<32, 2, 1>, S<1, 0, 2>, S<1, 0, 2>, + 2, 8, 8, 0, + S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, + 2, 16, 16, 0, + 1, 1, S<1, 16, 1, 4>, 4, + ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1>; +// clang-format on + +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +#include "run_gemm_example_streamk_v2.inc" + +int main(int argc, char* argv[]) { return !run_gemm_universal_streamk_example(argc, argv); } diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp index 18203e7d5c..372e744bd7 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp @@ -635,7 +635,7 @@ void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadd PassThrough>>>& instances); #endif -#if(defined(CK_ENABLE_FP8)) +#if(defined(CK_ENABLE_FP16) && 
defined(CK_ENABLE_FP8)) void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instances( std::vector>>& @@ -834,6 +834,83 @@ void add_device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding instances); #endif +#if(defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP8)) +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances( + 
std::vector>>& + instances); + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); +#endif + template && is_same_v && is_same_v) { @@ -1141,6 +1218,54 @@ struct DeviceOperationInstanceFactory && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_kpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instances( + op_ptrs); + + add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances( + op_ptrs); + add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instances( + op_ptrs); + } + } +#endif return op_ptrs; } diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt old mode 100644 new mode 100755 index e1612bcd24..b7391d3446 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt @@ -21,9 +21,7 @@ list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp - device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp @@ -44,7 +42,6 @@ list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp - device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp 
device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp @@ -65,7 +62,6 @@ list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp - device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instance.cpp device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instance.cpp @@ -101,6 +97,21 @@ list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp - device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp) - + device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp + 
device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_default_instance.cpp + device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_default_instance.cpp + device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp + device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp + device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp + device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp + device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp +) add_instance_library(device_gemm_universal_streamk_instance ${GEMM_UNIVERSAL_STREAMK_INSTANCES}) diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp index 209d8f644e..959c1c0992 100755 --- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp @@ -51,8 +51,10 @@ using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances = // AGPR Spill // DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 32, 8, 8, 16, 16, 8, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, // AGPR Spill when use permuted lds layout. so, use padding for these two. 
+ DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 224, 64, 8, 8, 16, 16, 8, 7, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 32, 1, 8>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnkpadding_instance.cpp new file mode 100755 index 0000000000..a16d3988fe --- /dev/null +++ 
b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnkpadding_instance.cpp @@ -0,0 +1,31 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnkpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnpadding_instance.cpp new file mode 100755 index 0000000000..3716b46f6c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnpadding_instance.cpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_mnpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp new file mode 100755 index 0000000000..00ed1698dd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_mnkpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp new file mode 100755 index 0000000000..bee03061a0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_mnkpadding_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp new file mode 100755 index 0000000000..5bf5c01b97 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F8 = f8_t; +using BF16 = bhalf_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmNKPadding = GemmSpecialization::NKPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_instances = std::tuple< +// clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| 
ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + //Only enable these instances on gfx94x + // Compute friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 64, 16, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, 
GemmSpec, 256, 256, 256, 128, 16, 8, 16, 16, 8, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 64, 16, 4, 16, 16, 8, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 128, 16, 8, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 256, 64, 16, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, 
S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 128, 64, 16, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 64, 128, 16, 4, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 64, 64, 128, 16, 4, 32, 32, 1, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> +#endif + // clang-format on + >; + +template +using device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_instances = std::tuple< +// clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| 
BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + // Latency friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, 
PassThrough, PassThrough, GemmSpec, 64, 16, 16, 256, 16, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 512, 16, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 128, 16, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 256, 16, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 512, 16, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + // Memory friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 128, 16, 4, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + 
DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 128, 16, 4, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 128, 16, 4, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 256, 16, 4, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 512, 16, 8, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 0, 1, 1, S<1, 16, 1, 
4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 128, 16, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 256, 16, 4, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 512, 16, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 128, 16, 4, 16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 128, 16, 8, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 128, 8, 8, 16, 16, 1, 4, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 
0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8> +#endif + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp new file mode 100644 index 0000000000..689c2bbbec --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_kpadding_instance.cpp new file mode 100644 index 0000000000..149b830a83 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_kpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp new file mode 100644 index 0000000000..db5082f25c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_default_instance.cpp new file mode 100644 index 0000000000..cd2ad4f654 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp new file mode 100644 index 0000000000..1ed170785b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instance.cpp new file mode 100644 index 0000000000..9e28c16191 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_default_instance.cpp new file mode 100644 index 0000000000..85dc38fbe4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp new file mode 100644 index 0000000000..2f188ac939 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instance.cpp new file mode 100644 index 0000000000..94684921c7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_kn_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp new file mode 100755 index 0000000000..540b90e54b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F8 = f8_t; +using BF16 = bhalf_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_instances = std::tuple< +// clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| 
SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + // Compute friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 64, 16, 16, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 128, 16, 16, 16, 16, 8, 8, S<8, 32, 1>, S<1, 0, 
2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 256, 64, 16, 16, 16, 16, 8, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 224, 256, 128, 16, 16, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 224, 128, 16, 16, 16, 16, 8, 7, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 64, 1, 4>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, 
BlockGemmPipelineVersion::v5, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 64, 128, 16, 16, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + // DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 64, 128, 128, 16, 16, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, 
PassThrough, GemmSpec, 256, 64, 64, 128, 16, 16, 32, 32, 1, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> +#endif + // clang-format on + >; + +template +using device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_instances = std::tuple< +// clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) + // Latency friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, 
PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 256, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 512, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 256, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + 
DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 512, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>, + // Memory friendly + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 32, 128, 16, 16, 32, 32, 2, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 256, 16, 128, 16, 16, 16, 16, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 32, 128, 16, 16, 32, 32, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 16, 128, 16, 16, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 32, 128, 16, 16, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 
2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 16, 128, 16, 16, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 64, 16, 16, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 128, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 256, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 16, 512, 16, 16, 16, 16, 1, 
1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 256, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 32, 512, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 64, 128, 16, 16, 16, 16, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 64, 128, 16, 16, 32, 32, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, 
PassThrough, PassThrough, PassThrough, GemmSpec, 128, 16, 128, 128, 16, 16, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 32, 128, 128, 16, 16, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 16, 256, 128, 16, 16, 16, 16, 1, 4, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8>, + DeviceGemm_Xdl_CShuffle_Streamk_V3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 32, 256, 128, 16, 16, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2, F8> +#endif + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp new file mode 100644 index 0000000000..df07e21eef --- /dev/null +++ 
b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp new file mode 100644 index 0000000000..22ffb264b7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp new file mode 100644 index 0000000000..d5e84297d9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp new file mode 100644 index 0000000000..314aec027a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp new file mode 100644 index 0000000000..eb0c871a04 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp new file mode 100644 index 0000000000..df92ed71c4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f8_bf16/device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_xdl_universal_streamk_f8_f8_bf16_mk_nk_mn_mem_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp index 72194e8e61..d145ab1766 100644 --- a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp +++ b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp @@ -11,6 +11,7 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp" #include "ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp" @@ -20,12 +21,14 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp" namespace ck { namespace profiler { template a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_ref_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); int total_gemm_needed = a_m_k.GetElementSpaceSizeInBytes() + 
b_k_n.GetElementSpaceSizeInBytes(); int rotating_count = std::max( @@ -103,6 +108,9 @@ bool profile_gemm_universal_streamk_impl(int do_verification, DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_device_ref_buf(sizeof(CDataType) * + c_m_n_device_ref_result.mDesc.GetElementSpaceSize()); + a_device_buf.ToDevice(a_m_k.mData.data()); b_device_buf.ToDevice(b_k_n.mData.data()); @@ -125,21 +133,22 @@ bool profile_gemm_universal_streamk_impl(int do_verification, // Run reference GEMM if(do_verification) { - using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; - auto ref_gemm = ReferenceGemmInstance{}; - auto ref_invoker = ref_gemm.MakeInvoker(); - - auto ref_argument = ref_gemm.MakeArgument( + // Use CPU validation + // Note: GPU validation is not supported for fp8 !!! + using ReferenceGemmInstanceCPU = ck::tensor_operation::host::ReferenceGemm; + auto ref_gemm_cpu = ReferenceGemmInstanceCPU{}; + auto ref_invoker_cpu = ref_gemm_cpu.MakeInvoker(); + auto ref_argument_cpu = ref_gemm_cpu.MakeArgument( a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); - - ref_invoker.Run(ref_argument); + ref_invoker_cpu.Run(ref_argument_cpu); } std::string best_op_name; @@ -157,7 +166,7 @@ bool profile_gemm_universal_streamk_impl(int do_verification, 0, 1, 2, 3, 4}; // 0: Data Parallel (DP) mode (Stream-K OFF), 1: 1-tile Stream-K+ DP, // 2:2-tile Stream-K + DP - if(Grid_size == -1) + if(Grid_size != -1) { grid_size_list = {Grid_size}; } @@ -203,6 +212,7 @@ bool profile_gemm_universal_streamk_impl(int do_verification, { c_device_buf.FromDevice(c_m_n_device_result.mData.data()); + // Always compare against CPU reference results computed earlier pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); if(do_log) diff --git a/profiler/src/profile_gemm_universal_streamk.cpp 
b/profiler/src/profile_gemm_universal_streamk.cpp old mode 100644 new mode 100755 index b0f66a0c73..4d1ab811ee --- a/profiler/src/profile_gemm_universal_streamk.cpp +++ b/profiler/src/profile_gemm_universal_streamk.cpp @@ -26,6 +26,7 @@ enum struct GemmDataType F8_F16_F16, // 4 F16_F8_F16, // 5 F16_F16_F16_F8, // 6 + F8_F8_BF16, // 7 }; #define OP_NAME "gemm_universal_streamk" @@ -37,7 +38,7 @@ int profile_gemm_universal_streamk(int argc, char* argv[]) { printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: f16, " - "comp f8)\n"); + "comp f8; 7: f8->bf16,)\n"); printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"); printf(" 1: A[m, k] * B[n, k] = C[m, n];\n"); printf(" 2: A[k, m] * B[k, n] = C[m, n];\n"); @@ -112,15 +113,17 @@ int profile_gemm_universal_streamk(int argc, char* argv[]) auto profile = [&](auto a_type, auto b_type, + auto comp_type, auto acc_type, auto c_type, auto a_layout, auto b_layout, auto c_layout) { - using ADataType = decltype(a_type); - using BDataType = decltype(b_type); - using AccDataType = decltype(acc_type); - using CDataType = decltype(c_type); + using ADataType = decltype(a_type); + using BDataType = decltype(b_type); + using ComputeDataType = decltype(comp_type); + using AccDataType = decltype(acc_type); + using CDataType = decltype(c_type); using ALayout = decltype(a_layout); using BLayout = decltype(b_layout); @@ -132,6 +135,7 @@ int profile_gemm_universal_streamk(int argc, char* argv[]) bool pass = ck::profiler::profile_gemm_universal_streamk_impl Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_BF16_MK_NK, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 512; + constexpr int K = 320; + + 
constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_BF16_KM_KN, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + { + int StrideA = M; + this->Run(M, N, K, StrideA, StrideB, StrideC); + } +} + +TYPED_TEST(TestGemmUniversal_Streamk_BF16_MK_KN, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_BF16_MK_NK, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_BF16_KM_KN, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + { + int StrideA = M; + this->Run(M, N, K, StrideA, StrideB, StrideC); + } +} + +TYPED_TEST(TestGemmUniversal_Streamk_BF16_MK_KN, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 512; + constexpr int K = 437; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_BF16_MK_NK, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 512; + constexpr int K = 437; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, 
StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_BF16_KM_KN, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 512; + constexpr int K = 437; + + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + { + int StrideA = M; + this->Run(M, N, K, StrideA, StrideB, StrideC); + } +} + +TYPED_TEST(TestGemmUniversal_Streamk_BF16_KM_NK, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 512; + constexpr int K = 437; + + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + { + int StrideA = M; + this->Run(M, N, K, StrideA, StrideB, StrideC); + } +} + +TYPED_TEST(TestGemmUniversal_Streamk_BF16_MK_KN, Regular) +{ + std::vector Ms{512}; + constexpr int N = 512; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_BF16_MK_NK, Regular) +{ + std::vector Ms{512}; + constexpr int N = 512; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp16.inc b/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp16.inc new file mode 100644 index 0000000000..b2fdfe8193 --- /dev/null +++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp16.inc @@ -0,0 +1,113 @@ +#pragma once + +TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_KN, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_NK, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 512; + constexpr int K = 
320; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_KN, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_NK, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_KN, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 512; + constexpr int K = 437; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_NK, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 512; + constexpr int K = 437; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_KN, Regular) +{ + std::vector Ms{512}; + constexpr int N = 512; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_NK, Regular) +{ + std::vector Ms{512}; + constexpr int N = 512; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} 
diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp8.inc b/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp8.inc new file mode 100755 index 0000000000..b3da08f703 --- /dev/null +++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp8.inc @@ -0,0 +1,113 @@ +#pragma once + +TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_KN, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_NK, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_KN, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_NK, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_KN, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 512; + constexpr int K = 437; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_NK, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 512; + 
constexpr int K = 437; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_KN, Regular) +{ + std::vector Ms{512}; + constexpr int N = 512; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_NK, Regular) +{ + std::vector Ms{512}; + constexpr int N = 512; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp b/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp new file mode 100644 index 0000000000..ef3509c0ca --- /dev/null +++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "include/ck/utility/data_type.hpp" +#include "profiler/profile_gemm_universal_streamk_impl.hpp" + +namespace ck { +namespace test { + +template +class TestGemmUniversal_Streamk : public testing::Test +{ + using Row = ck::tensor_layout::gemm::RowMajor; + using F32 = float; + + protected: + using ALayout = std::tuple_element_t<0, Tuple>; + using BLayout = std::tuple_element_t<1, Tuple>; + using CLayout = Row; + using ADataType = std::tuple_element_t<2, Tuple>; + using BDataType = std::tuple_element_t<3, Tuple>; + using ComputeDataType = std::tuple_element_t<4, Tuple>; + using CDataType = std::tuple_element_t<5, Tuple>; + + public: + static constexpr bool verify_ = true; + static constexpr int init_method_ = 1; // decimal value initialization + static constexpr bool log_ = false; + static constexpr bool bench_ = false; // measure kernel performance + + std::vector grid_size_list; + std::vector streamk_sel_list; + + void SetUp() override + { + grid_size_list = {38, 114, 228}; // {38, 76, 114, 152, 190, 228, 266, 304, 342, 380}; + streamk_sel_list = {0, 1, 2}; // 0: Data Parallel (DP) mode (Stream-K OFF), 1: 1-tile + // Stream-K+ DP, // {0, 1, 2, 3, 4} + // 2:2-tile Stream-K + DP + } + + void Run(const int M, + const int N, + const int K, + const int StrideA, + const int StrideB, + const int StrideC) + { + for(auto streamk_sel : streamk_sel_list) + for(auto grid_size : grid_size_list) + { + RunSingle(M, N, K, StrideA, StrideB, StrideC, streamk_sel, grid_size); + } + } + + void RunSingle(const int M, + const int N, + const int K, + const int StrideA, + const int StrideB, + const int StrideC, + int streamk_sel, + int Grid_size, + int n_warmup = 1, + int n_iter = 10) + { + bool pass = ck::profiler::profile_gemm_universal_streamk_impl(verify_, 
+ init_method_, + log_, + bench_, + M, + N, + K, + StrideA, + StrideB, + StrideC, + streamk_sel, + Grid_size, + n_warmup, + n_iter); + EXPECT_TRUE(pass); + } +}; + +} // namespace test +} // namespace ck diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_bf16.cpp b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_bf16.cpp new file mode 100755 index 0000000000..1aef74cf18 --- /dev/null +++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_bf16.cpp @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "gtest/gtest.h" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "test_gemm_universal_streamk_util.hpp" + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +namespace { + +template +struct tuple_concat; + +template +struct tuple_concat, std::tuple> +{ + using type = std::tuple; +}; + +} // namespace + +template +class TestGemmUniversal_Streamk_BF16_MK_KN + : public ck::test::TestGemmUniversal_Streamk< + typename tuple_concat, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_Streamk_BF16_MK_NK + : public ck::test::TestGemmUniversal_Streamk< + typename tuple_concat, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_Streamk_BF16_KM_KN + : public ck::test::TestGemmUniversal_Streamk< + typename tuple_concat, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_Streamk_BF16_KM_NK + : public ck::test::TestGemmUniversal_Streamk< + typename tuple_concat, Tuple>::type> +{ +}; + +// clang-format off +using KernelTypes_MK_KN = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + std::tuple< BF16, BF16, BF16, BF16> + >; +using KernelTypes_MK_NK = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + + std::tuple< BF16, BF16, BF16, BF16> + >; + +using 
KernelTypes_KM_NK = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + std::tuple< BF16, BF16, BF16, BF16> + >; + +using KernelTypes_KM_KN = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + std::tuple< BF16, BF16, BF16, BF16> + >; + +// clang-format on + +TYPED_TEST_SUITE(TestGemmUniversal_Streamk_BF16_MK_KN, KernelTypes_MK_KN); +TYPED_TEST_SUITE(TestGemmUniversal_Streamk_BF16_MK_NK, KernelTypes_MK_NK); +TYPED_TEST_SUITE(TestGemmUniversal_Streamk_BF16_KM_KN, KernelTypes_KM_KN); +TYPED_TEST_SUITE(TestGemmUniversal_Streamk_BF16_KM_NK, KernelTypes_KM_NK); + +#include "test_gemm_universal_streamk_ut_cases_bf16.inc" diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp16.cpp b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp16.cpp new file mode 100644 index 0000000000..43b122ff0d --- /dev/null +++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp16.cpp @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "gtest/gtest.h" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "test_gemm_universal_streamk_util.hpp" + +using F8 = ck::f8_t; +using F16 = ck::half_t; + +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +namespace { + +template +struct tuple_concat; + +template +struct tuple_concat, std::tuple> +{ + using type = std::tuple; +}; + +} // namespace + +template +class TestGemmUniversal_Streamk_FP16_MK_KN + : public ck::test::TestGemmUniversal_Streamk< + typename tuple_concat, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_Streamk_FP16_MK_NK + : public ck::test::TestGemmUniversal_Streamk< + typename tuple_concat, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_Streamk_FP16_KM_KN + : public ck::test::TestGemmUniversal_Streamk< + typename tuple_concat, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_Streamk_FP16_KM_NK + : public ck::test::TestGemmUniversal_Streamk< + typename tuple_concat, Tuple>::type> +{ +}; + +// clang-format off +using KernelTypes_MK_KN = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType +#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) + std::tuple< F16, F8, F16, F16>, + std::tuple< F8, F16, F16, F16>, +#endif + + std::tuple< F16, F16, F16, F16> + >; +using KernelTypes_MK_NK = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + +#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) + std::tuple< F16, F8, F16, F16>, + std::tuple< F8, F16, F16, F16>, +#endif + std::tuple< F16, F16, F16, F16> + >; + +// clang-format on + +TYPED_TEST_SUITE(TestGemmUniversal_Streamk_FP16_MK_KN, KernelTypes_MK_KN); +TYPED_TEST_SUITE(TestGemmUniversal_Streamk_FP16_MK_NK, KernelTypes_MK_NK); + +#include "test_gemm_universal_streamk_ut_cases_fp16.inc" diff --git 
a/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp8.cpp b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp8.cpp new file mode 100755 index 0000000000..3836de056c --- /dev/null +++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_xdl_fp8.cpp @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "gtest/gtest.h" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "test_gemm_universal_streamk_util.hpp" + +using F8 = ck::f8_t; +using F16 = ck::half_t; +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +namespace { + +template +struct tuple_concat; + +template +struct tuple_concat, std::tuple> +{ + using type = std::tuple; +}; + +} // namespace + +template +class TestGemmUniversal_Streamk_FP8_MK_KN + : public ck::test::TestGemmUniversal_Streamk< + typename tuple_concat, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_Streamk_FP8_MK_NK + : public ck::test::TestGemmUniversal_Streamk< + typename tuple_concat, Tuple>::type> +{ +}; + +// clang-format off +using KernelTypes_MK_KN = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + +#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) + std::tuple< F16, F8, F16, F16>, + std::tuple< F8, F16, F16, F16>, + std::tuple< F8, F8, F8, BF16>, +#endif + // Fallback test type when FP8 is not enabled + std::tuple< F16, F16, F16, F16> + >; +using KernelTypes_MK_NK = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + +#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) + std::tuple< F16, F8, F16, F16>, + std::tuple< F8, F16, F16, F16>, + std::tuple< F8, F8, F8, BF16>, +#endif + // Fallback test type when FP8 is not enabled + std::tuple< F16, F16, 
F16, F16> + >; + +// clang-format on + +TYPED_TEST_SUITE(TestGemmUniversal_Streamk_FP8_MK_KN, KernelTypes_MK_KN); +TYPED_TEST_SUITE(TestGemmUniversal_Streamk_FP8_MK_NK, KernelTypes_MK_NK); + +#include "test_gemm_universal_streamk_ut_cases_fp8.inc" From 6355ee7ca5c6f3c1a22ee40f58fe6dc956b94242 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Tue, 1 Apr 2025 16:11:42 +0200 Subject: [PATCH 009/443] Improve compilation time for grouped conv fwd (#2039) * Improve compilation time for grouped conv fwd * Fix --- .../gpu/grouped_convolution_forward.hpp | 12 ++ .../grouped_convolution_forward_comp_xdl.inc | 112 ++++++++++++++++++ .../gpu/grouped_conv2d_fwd/CMakeLists.txt | 8 ++ ...gchw_gkcyx_ngkhw_bf16_comp_2x_instance.cpp | 43 +++++++ ...l_ngchw_gkcyx_ngkhw_bf16_comp_instance.cpp | 24 ---- ...w_gkcyx_ngkhw_bf16_comp_part2_instance.cpp | 45 +++++++ ...ngchw_gkcyx_ngkhw_f16_comp_2x_instance.cpp | 43 +++++++ ...dl_ngchw_gkcyx_ngkhw_f16_comp_instance.cpp | 24 ---- ...hw_gkcyx_ngkhw_f16_comp_part2_instance.cpp | 45 +++++++ ...hwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp | 70 +++++++++++ ...l_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp | 80 +------------ ...c_gkyxc_nhwgk_bf16_comp_part2_instance.cpp | 70 +++++++++++ ...nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp | 70 +++++++++++ ...dl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp | 80 +------------ ...gc_gkyxc_nhwgk_f16_comp_part2_instance.cpp | 70 +++++++++++ 15 files changed, 590 insertions(+), 206 deletions(-) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_2x_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_part2_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_2x_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_part2_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp index c2e1337737..0b7df6ecfb 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp @@ -226,6 +226,9 @@ struct DeviceOperationInstanceFactory>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_FP16 @@ -39,6 +67,34 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_FP32 @@ -88,6 +144,34 @@ void 
add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_2x_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_part2_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_BF16 @@ -104,6 +188,34 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_2x_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_part2_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_FP32 diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt index b095840a34..c1790901ec 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt @@ -60,10 +60,18 @@ add_instance_library(device_grouped_conv2d_fwd_instance xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp + xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp + xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp + xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp + xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp # NGCHW, GKCYX, NGKHW xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instance.cpp xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_instance.cpp 
xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_comp_instance.cpp + xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_2x_instance.cpp + xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_2x_instance.cpp + xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_part2_instance.cpp + xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_part2_instance.cpp #dl # GNHWC, GKYXC, GNHWK dl/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_2x_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_2x_instance.cpp new file mode 100644 index 0000000000..6cb4ca5652 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_2x_instance.cpp @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_2x_instances( + std::vector>>& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwdDefault>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instance.cpp index b055e782c2..7368587c93 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instance.cpp @@ -32,30 +32,6 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instances( Empty_Tuple, NGKHW, ConvFwdDefault>{}); - - if(ck::get_device_name() != "gfx950") - { - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, - NGCHW, - GKCYX, - Empty_Tuple, - NGKHW, - ConvFwdDefault>{}); - } - - if(ck::get_device_name() == "gfx950") - { - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, - 
NGCHW, - GKCYX, - Empty_Tuple, - NGKHW, - ConvFwdDefault>{}); - } } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_part2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_part2_instance.cpp new file mode 100644 index 0000000000..7f0feb61d8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_part2_instance.cpp @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_part2_instances( + std::vector>>&) +{ + if(ck::get_device_name() != "gfx950") + { +#if 0 // TODO: Improve compilation time and enable these instances + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwdDefault>{}); +#endif + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_2x_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_2x_instance.cpp new file mode 
100644 index 0000000000..f9ad6b8212 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_2x_instance.cpp @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_2x_instances( + std::vector>>& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwdDefault>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_instance.cpp index 13e0e91f97..803de2de55 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_instance.cpp @@ -32,30 +32,6 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_instances( Empty_Tuple, NGKHW, ConvFwdDefault>{}); - - if(ck::get_device_name() != "gfx950") - { - 
add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, - NGCHW, - GKCYX, - Empty_Tuple, - NGKHW, - ConvFwdDefault>{}); - } - - if(ck::get_device_name() == "gfx950") - { - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, - NGCHW, - GKCYX, - Empty_Tuple, - NGKHW, - ConvFwdDefault>{}); - } } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_part2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_part2_instance.cpp new file mode 100644 index 0000000000..da7949668a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_part2_instance.cpp @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_part2_instances( + std::vector>>&) +{ + if(ck::get_device_name() != "gfx950") + { +#if 0 // TODO: Improve compilation time and enable these instances + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwdDefault>{}); +#endif + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp new file mode 100644 index 0000000000..c078f8ed04 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances( + std::vector>>& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdDefault>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1S1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp index a344e35c8d..a67b11f1cf 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp @@ -1,5 
+1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" @@ -57,84 +57,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances( Empty_Tuple, NHWGK, ConvFwdOddC>{}); - - if(ck::get_device_name() != "gfx950") - { - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdDefault>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwd1x1P0>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwd1x1S1P0>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); - } - - if(ck::get_device_name() == "gfx950") - { - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdDefault>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwd1x1P0>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwd1x1S1P0>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); - } } } // namespace 
instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp new file mode 100644 index 0000000000..5c0391a25f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances( + std::vector>>& instances) +{ + if(ck::get_device_name() != "gfx950") + { + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdDefault>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1S1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); + } +} + +} // namespace instance +} // namespace device +} // 
namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp new file mode 100644 index 0000000000..726276c461 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances( + std::vector>>& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdDefault>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1S1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); + } +} + +} // namespace instance +} // namespace device +} 
// namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp index 30a8b60bfc..8b7bdec2a8 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" @@ -57,84 +57,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances( Empty_Tuple, NHWGK, ConvFwdOddC>{}); - - if(ck::get_device_name() != "gfx950") - { - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdDefault>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwd1x1P0>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwd1x1S1P0>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); - } - - if(ck::get_device_name() == "gfx950") - { - 
add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdDefault>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwd1x1P0>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwd1x1S1P0>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); - } } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp new file mode 100644 index 0000000000..c66114b9a3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances( + std::vector>>& instances) +{ + if(ck::get_device_name() != "gfx950") + { + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdDefault>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1S1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck From c59a8bb206d4dc763d07e16f730e563849e68cb6 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Tue, 1 Apr 2025 12:06:25 -0700 Subject: [PATCH 010/443] add a fast compilation path for static for (0..N) (#2005) * add a fast compilation path for static for (0..N) * Update functional2.hpp add comment and put range applier into detail namespace * Update functional.hpp ditto for ck-tile * prettify * prettify more * add comment * clang-format --- include/ck/utility/functional2.hpp | 24 +++++++++++++++++++++ include/ck_tile/core/utility/functional.hpp | 24 
+++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/include/ck/utility/functional2.hpp b/include/ck/utility/functional2.hpp index 99c65f4eb8..a11963cb47 100644 --- a/include/ck/utility/functional2.hpp +++ b/include/ck/utility/functional2.hpp @@ -46,4 +46,28 @@ struct static_for } }; +namespace detail { + +template +struct applier +{ + template + __host__ __device__ constexpr void operator()(F f) const + { + // tweak -fbracket-depth if compilation fails. Clang default limit is 256 + (f(Number{}), ...); + } +}; + +template // == sizeof...(Is) +using make_applier = __make_integer_seq; + +} // namespace detail + +template +struct static_for<0, N, 1> : detail::make_applier +{ + using detail::make_applier::operator(); +}; + } // namespace ck diff --git a/include/ck_tile/core/utility/functional.hpp b/include/ck_tile/core/utility/functional.hpp index 2cdce94063..fd0252d3ca 100644 --- a/include/ck_tile/core/utility/functional.hpp +++ b/include/ck_tile/core/utility/functional.hpp @@ -58,6 +58,30 @@ struct static_for } }; +namespace detail { + +template +struct applier +{ + template + CK_TILE_HOST_DEVICE constexpr void operator()(F f) const + { + // tweak -fbracket-depth if compilation fails. Clang default limit is 256 + (f(number{}), ...); + } +}; + +template // == sizeof...(Is) +using make_applier = __make_integer_seq; + +} // namespace detail + +template +struct static_for<0, N, 1> : detail::make_applier +{ + using detail::make_applier::operator(); +}; + struct identity { template From df32020f93880a0086ac10a4e5cdbce47e6a1b41 Mon Sep 17 00:00:00 2001 From: Seunghoon Lee Date: Wed, 2 Apr 2025 04:22:10 +0900 Subject: [PATCH 011/443] Fix Windows build. (#2012) * Remove duplicate using uint64_t. * Cast before shift. 
--- include/ck/utility/dtype_vector.hpp | 2 -- include/ck_tile/core/utility/magic_div.hpp | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/include/ck/utility/dtype_vector.hpp b/include/ck/utility/dtype_vector.hpp index 8f70962fa6..9c40d923d3 100644 --- a/include/ck/utility/dtype_vector.hpp +++ b/include/ck/utility/dtype_vector.hpp @@ -2000,8 +2000,6 @@ struct vector_type()>> } }; -using int64_t = long; - // fp32 using float2_t = typename vector_type::type; using float4_t = typename vector_type::type; diff --git a/include/ck_tile/core/utility/magic_div.hpp b/include/ck_tile/core/utility/magic_div.hpp index fd9c733c52..1715983c09 100644 --- a/include/ck_tile/core/utility/magic_div.hpp +++ b/include/ck_tile/core/utility/magic_div.hpp @@ -38,7 +38,7 @@ struct magic_division32_bit_range shift_u32++; }; - uint64_t tmp_u64 = ((1UL << shift_u32) - divisor) << 32; + uint64_t tmp_u64 = static_cast((1UL << shift_u32) - divisor) << 32; uint32_t multiplier_u32 = tmp_u64 / divisor + 1; return make_tuple(multiplier_u32, shift_u32); From ec742908bdae09387e76980af628f7c1125473cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Tue, 1 Apr 2025 22:19:35 +0200 Subject: [PATCH 012/443] Grouped conv fwd v3 fix for SplitN an G > 1 (#2038) * Grouped conv fwd v3 fix for SplitN an G > 1 * Remove int8 large test * Retore int8 test --- ..._conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp | 27 +++++++------------ ...est_grouped_convnd_fwd_large_cases_xdl.cpp | 5 +++- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp index e91496f6a5..b2f1dbfa5c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp +++ 
b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp @@ -79,15 +79,12 @@ __global__ void [[maybe_unused]] const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_groups, - [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_n, - [[maybe_unused]] const index_t groups_count) + [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_n) { #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) // offset base pointer for each work-group - const index_t num_blocks_per_batch = __builtin_amdgcn_readfirstlane(gridDim.y / groups_count); - const index_t& num_blocks_per_n = groups_count; - const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_batch); - const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_n); + const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y); + const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z); const long_index_t a_batch_offset = amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx)); @@ -141,15 +138,12 @@ __global__ void [[maybe_unused]] const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_groups, - [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_n, - [[maybe_unused]] const index_t groups_count) + [[maybe_unused]] const ComputePtrOffset compute_ptr_offset_of_n) { #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) // offset base pointer for each work-group - const index_t num_blocks_per_batch = __builtin_amdgcn_readfirstlane(gridDim.y / groups_count); - const index_t& num_blocks_per_n = groups_count; - const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_batch); - const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / 
num_blocks_per_n); + const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y); + const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z); const long_index_t a_batch_offset = amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx)); @@ -766,7 +760,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(GemmM, GemmN, I1 /*arg.KBatch*/); - gdy *= arg.num_group_ * num_workgroups_per_Conv_N; + gdy = arg.num_group_; + gdz = num_workgroups_per_Conv_N; index_t K_split = (GemmK + KPerBlock - 1) / KPerBlock * KPerBlock; const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split); @@ -820,8 +815,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 arg.b_grid_desc_bk0_n_bk1_, arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, arg.compute_ptr_offset_of_groups_, - arg.compute_ptr_offset_of_n_, - arg.num_group_); + arg.compute_ptr_offset_of_n_); } else { @@ -836,8 +830,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 arg.b_grid_desc_bk0_n_bk1_, arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, arg.compute_ptr_offset_of_groups_, - arg.compute_ptr_offset_of_n_, - arg.num_group_); + arg.compute_ptr_offset_of_n_); } }; diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp index 088fed89ff..d017a40bce 100644 --- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp +++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include #include @@ -83,6 +83,9 @@ TYPED_TEST(TestGroupedConvndFwd2d, Test2D) // When image is larger than 2GB this->conv_params.push_back( {2, 2, 2, 128, 128, {3, 3}, {4096, 2048}, {300, 300}, {3, 3}, {1, 1}, {1, 1}}); + // Split N and G > 1 + this->conv_params.push_back( + {2, 4, 112, 8, 8, {3, 3}, {469, 724}, {2, 2}, {2, 2}, {1, 1}, {1, 1}}); this->template Run<2>(); } From 8c0ab61ece87f47e4ffece69e27c22b33f6074f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Tue, 1 Apr 2025 22:24:38 +0200 Subject: [PATCH 013/443] Grouped conv backward data GKCYX support (#2029) * Grouped conv backward data GKCYX support * profiler * Converter * split instances --- CHANGELOG.md | 5 + ...nv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp | 241 +++++++++++++----- .../gpu/grid/gridwise_elementwise_2d.hpp | 113 ++++++++ ...ice_grouped_conv_bwd_data_xdl_instance.hpp | 55 +++- .../device_grouped_conv_fwd_xdl_instance.hpp | 70 ++++- .../gpu/grouped_convolution_backward_data.hpp | 72 ++++++ .../grouped_convolution_backward_data_xdl.inc | 175 +++++++++++++ .../grouped_conv2d_bwd_data/CMakeLists.txt | 6 + ...ta_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp | 40 +++ ...kcyx_ngkhw_bf16_vec_transpose_instance.cpp | 40 +++ ...ata_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp | 40 +++ ...gkcyx_ngkhw_f16_vec_transpose_instance.cpp | 40 +++ ...ata_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp | 40 +++ ...gkcyx_ngkhw_f32_vec_transpose_instance.cpp | 40 +++ ...ta_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp | 22 +- ...ata_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp | 22 +- ...ata_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp | 22 +- ...ta_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 4 +- ...ata_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 4 +- ...ata_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp | 4 +- ...wd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp | 17 +- ...fwd_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp | 17 +- ...fwd_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp | 17 +- ...wd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp | 17 +- 
.../grouped_conv3d_bwd_data/CMakeLists.txt | 6 + ...xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp | 41 +++ ...zyx_ngkdhw_bf16_vec_transpose_instance.cpp | 41 +++ ..._xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp | 41 +++ ...czyx_ngkdhw_f16_vec_transpose_instance.cpp | 41 +++ ..._xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp | 41 +++ ...czyx_ngkdhw_f32_vec_transpose_instance.cpp | 41 +++ ...xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp | 20 +- ..._xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp | 20 +- ..._xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp | 20 +- .../src/profile_grouped_conv_bwd_data.cpp | 38 ++- script/convert_miopen_driver_to_profiler.py | 5 +- .../test_grouped_convnd_bwd_data_xdl.cpp | 6 + 37 files changed, 1286 insertions(+), 198 deletions(-) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_vec_transpose_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_vec_transpose_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_vec_transpose_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_vec_transpose_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_vec_transpose_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_vec_transpose_instance.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index de831a6898..8cc32e7bda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,11 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj ### Added * Added support for bf16, f32, and f16 for 2D and 3D NGCHW grouped convolution backward data +* Added support for GKCYX layout for grouped convolution forward (NGCHW/GKCYX/NGKHW). +* Added support for GKCYX layout for grouped convolution backward data (NGCHW/GKCYX/NGKHW). * Added support GKCYX layout for grouped convolution forward (NGCHW/GKCYX/NGKHW, number of instances in instance factory for NGCHW/GKYXC/NGKHW has been reduced). * Added support for Stream-K version of mixed fp8/bf16 GEMM + ### Optimized None @@ -22,6 +25,8 @@ None * Removed support for gfx940 and gfx941 targets (#1944) * Replaced the raw buffer load/store intrinsics with Clang20 built-ins (#1876) * DL and DPP kernels are now enabled by default. +* Number of instances in instance factory for grouped convolution forward NGCHW/GKYXC/NGKHW has been reduced. 
+* Number of instances in instance factory for grouped convolution backward data NGCHW/GKYXC/NGKHW has been reduced. ### Known issues diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp index 38e9e3c3d5..770e531e44 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp @@ -243,15 +243,21 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 static constexpr auto I3 = Number<3>{}; using ALayoutAfterTranspose = - std::conditional_t(), + std::conditional_t(), tensor_layout::convolution::NHWGK, - std::conditional_t(), + std::conditional_t(), tensor_layout::convolution::NDHWGK, ALayout>>; + using BLayoutAfterTranspose = + std::conditional_t(), + tensor_layout::convolution::GKYXC, + std::conditional_t(), + tensor_layout::convolution::GKZYXC, + BLayout>>; using ELayoutAfterTranspose = - std::conditional_t(), + std::conditional_t(), tensor_layout::convolution::NHWGC, - std::conditional_t(), + std::conditional_t(), tensor_layout::convolution::NDHWGC, ELayout>>; @@ -265,7 +271,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 DoPadGemmM, DoPadGemmN, ALayoutAfterTranspose, - BLayout, + BLayoutAfterTranspose, ELayoutAfterTranspose, true, /*SplitConvN*/ ABDataType, @@ -392,7 +398,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 // block-to-e-tile map using Block2ETileMap = remove_cvref_t; - using Block2TileMapElementwise = BlockToCTileMap_M00_N0_M01Adapt; + using Block2TileMapInOutElementwise = BlockToCTileMap_M00_N0_M01Adapt; + using Block2TileMapWeiElementwise = BlockToCTileMap_M00_N0_M01Adapt; static constexpr index_t ClusterLengthMPerBlock = 
CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock::At(1); @@ -418,6 +425,12 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 using NHWGCTransposeDescType = remove_cvref_t({}, {}))>; + using GKCYXTransposeDescType = + remove_cvref_t({}, {}))>; + using GKYXCTransposeDescType = + remove_cvref_t({}, {}))>; static constexpr index_t ElementwiseBlocksize = ClusterLengthMPerBlock * ClusterLengthNPerBlock; @@ -426,7 +439,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 Tuple, Tuple, Tuple, - Block2TileMapElementwise, + Block2TileMapInOutElementwise, element_wise::PassThrough, ElementwiseBlocksize, NPerBlock, @@ -439,12 +452,30 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 I1, I0>; + using GridwiseElementwiseWeightTranspose = + GridwiseElementwise, + Tuple, + Tuple, + Tuple, + Block2TileMapWeiElementwise, + element_wise::PassThrough, + ElementwiseBlocksize, + MPerBlock, + NPerBlock, + MPerBlock / ClusterLengthMPerBlock, + NPerBlock / ClusterLengthNPerBlock, + Sequence<1, 0>, + Sequence<1>, + Sequence, + I0, + I1>; + using GridwiseElementwiseOutputTranspose = GridwiseElementwise, Tuple, Tuple, Tuple, - Block2TileMapElementwise, + Block2TileMapInOutElementwise, element_wise::PassThrough, ElementwiseBlocksize, NPerBlock, @@ -498,6 +529,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 std::array a_g_n_k_wos_strides_transposed = conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(a_g_n_k_wos_lengths, a_g_n_k_wos_strides); + std::array b_g_k_c_xs_strides_transposed = + conv_ngchw_to_nhwgc_transformer.TransposeWeiStrides(b_g_k_c_xs_lengths, + b_g_k_c_xs_strides); std::array e_g_n_c_wis_strides_transposed = conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(e_g_n_c_wis_lengths, e_g_n_c_wis_strides); @@ -584,7 +618,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 a_g_n_k_wos_lengths, a_g_n_k_wos_strides_transposed, b_g_k_c_xs_lengths, - b_g_k_c_xs_strides, + b_g_k_c_xs_strides_transposed, 
e_g_n_c_wis_lengths, e_g_n_c_wis_strides_transposed, conv_filter_strides, @@ -618,7 +652,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 DoPadGemmM, DoPadGemmN, ALayoutAfterTranspose, - BLayout, + BLayoutAfterTranspose, DLayout, true, /*SplitConvN*/ ABDataType, @@ -627,7 +661,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 a_g_n_k_wos_lengths, a_g_n_k_wos_strides_transposed, b_g_k_c_xs_lengths, - b_g_k_c_xs_strides, + b_g_k_c_xs_strides_transposed, ds_g_n_c_wis_lengths[i], ds_g_n_c_wis_strides[i], conv_filter_strides, @@ -682,7 +716,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 } // A/B/Ds/E Batch Stride compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides_transposed[0]; - compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides_transposed[0]; compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides_transposed[0]; compute_ptr_offset_of_n_.BatchStrideA_ = @@ -692,8 +726,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 num_workgroups_per_Conv_N_ = a_g_n_k_wos_lengths_[I1] / conv_N_per_block_; - if constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) { // Use not modified base strides a_in_transpose_desc_ = @@ -703,6 +737,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 conv_ngchw_to_nhwgc_transformer.template MakeNHWGCTransposeDesc( a_g_n_k_wos_lengths, a_g_n_k_wos_strides, num_workgroups_per_Conv_N_); + b_in_transpose_desc_ = + conv_ngchw_to_nhwgc_transformer.template MakeGKCYXTransposeDesc( + b_g_k_c_xs_lengths, b_g_k_c_xs_strides); + b_out_transpose_desc_ = + conv_ngchw_to_nhwgc_transformer.template MakeGKYXCTransposeDesc( + b_g_k_c_xs_lengths, b_g_k_c_xs_strides); + e_in_transpose_desc_ = conv_ngchw_to_nhwgc_transformer.template MakeNHWGCTransposeDesc( e_g_n_c_wis_lengths, e_g_n_c_wis_strides, num_workgroups_per_Conv_N_); @@ 
-710,9 +751,11 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 conv_ngchw_to_nhwgc_transformer.template MakeNGCHWTransposeDesc( e_g_n_c_wis_lengths, e_g_n_c_wis_strides, num_workgroups_per_Conv_N_); - elementwise_block_2_ctile_map_transpose_a_ = Block2TileMapElementwise{ + elementwise_block_2_ctile_map_transpose_a_ = Block2TileMapInOutElementwise{ a_in_transpose_desc_.GetLength(I0), a_in_transpose_desc_.GetLength(I1)}; - elementwise_block_2_ctile_map_transpose_e_ = Block2TileMapElementwise{ + elementwise_block_2_ctile_map_transpose_b_ = Block2TileMapWeiElementwise{ + b_in_transpose_desc_.GetLength(I0), b_in_transpose_desc_.GetLength(I1)}; + elementwise_block_2_ctile_map_transpose_e_ = Block2TileMapInOutElementwise{ e_in_transpose_desc_.GetLength(I0), e_in_transpose_desc_.GetLength(I1)}; compute_ptr_offset_of_workspace_n_.BatchStrideA_ = @@ -724,25 +767,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 std::size_t GetWorkspaceATensorSizeBytes() const { - const long_index_t a_acum = ck::accumulate_n( - a_g_n_k_wos_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>()); - return sizeof(ADataType) * a_acum; - } - - std::size_t GetWorkspaceETensorSizeBytes() const - { - const long_index_t e_accum = ck::accumulate_n( - e_g_n_c_wis_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>()); - return sizeof(EDataType) * e_accum; - } - - std::size_t GetWorkspaceSizeBytes() const - { - // Transpose require workspace for A and B - if constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) { - return GetWorkspaceATensorSizeBytes() + GetWorkspaceETensorSizeBytes(); + const long_index_t a_acum = ck::accumulate_n( + a_g_n_k_wos_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>()); + // Align to 128B + return math::integer_divide_ceil(sizeof(ADataType) * a_acum, 128) * 128; } else { @@ -750,6 +781,43 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 } } + std::size_t 
GetWorkspaceBTensorSizeBytes() const + { + if constexpr(is_NGCHW_GKCYX_NGKHW() || + is_NGCDHW_GKCZYX_NGKDHW()) + { + const long_index_t b_acum = ck::accumulate_n( + b_g_k_c_xs_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>()); + // Align to 128B + return math::integer_divide_ceil(sizeof(BDataType) * b_acum, 128) * 128; + } + else + { + return 0; + } + } + + std::size_t GetWorkspaceETensorSizeBytes() const + { + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) + { + const long_index_t e_accum = ck::accumulate_n( + e_g_n_c_wis_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>()); + return sizeof(EDataType) * e_accum; + } + else + { + return 0; + } + } + + std::size_t GetWorkspaceSizeBytes() const + { + return GetWorkspaceATensorSizeBytes() + GetWorkspaceBTensorSizeBytes() + + GetWorkspaceETensorSizeBytes(); + } + void Print() const { for(std::size_t i = 0; i < a_grid_desc_ak0_m_ak1_container_.size(); i++) @@ -796,11 +864,14 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 // block-to-e-tile map std::vector block_2_etile_map_container_; - Block2TileMapElementwise elementwise_block_2_ctile_map_transpose_a_, + Block2TileMapInOutElementwise elementwise_block_2_ctile_map_transpose_a_, elementwise_block_2_ctile_map_transpose_e_; + Block2TileMapWeiElementwise elementwise_block_2_ctile_map_transpose_b_; NGCHWTransposeDescType a_in_transpose_desc_, e_out_transpose_desc_; NHWGCTransposeDescType a_out_transpose_desc_, e_in_transpose_desc_; + GKCYXTransposeDescType b_in_transpose_desc_; + GKYXCTransposeDescType b_out_transpose_desc_; // for computing batch offset ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; @@ -835,14 +906,24 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 const index_t gdz = arg.num_workgroups_per_Conv_N_; const ADataType* p_a_grid = arg.p_a_grid_; + const BDataType* p_b_grid = arg.p_b_grid_; EDataType* p_e_grid = arg.p_e_grid_; - if constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) + 
if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) { p_a_grid = type_convert(arg.p_workspace_); - p_e_grid = type_convert(arg.p_workspace_) + - arg.GetWorkspaceATensorSizeBytes() / sizeof(EDataType); + p_e_grid = + type_convert(arg.p_workspace_) + + (arg.GetWorkspaceATensorSizeBytes() + arg.GetWorkspaceBTensorSizeBytes()) / + sizeof(EDataType); + } + + if constexpr(is_NGCHW_GKCYX_NGKHW() || + is_NGCDHW_GKCZYX_NGKDHW()) + { + p_b_grid = type_convert(arg.p_workspace_) + + arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType); } for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++) @@ -888,7 +969,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 dim3(BlockSize), 0, p_a_grid, - arg.p_b_grid_, + p_b_grid, arg.p_ds_grid_, p_e_grid, arg.a_element_op_, @@ -925,11 +1006,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 arg.Print(); } // Transpose from NGKHW to NHWGK - if constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) { - EDataType* p_e_in_grid = type_convert(arg.p_workspace_) + - arg.GetWorkspaceATensorSizeBytes() / sizeof(EDataType); + EDataType* p_e_in_grid = + type_convert(arg.p_workspace_) + + (arg.GetWorkspaceATensorSizeBytes() + arg.GetWorkspaceBTensorSizeBytes()) / + sizeof(EDataType); const auto clear_workspace = [&]() { hip_check_error(hipMemsetAsync(p_e_in_grid, @@ -938,47 +1021,72 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 stream_config.stream_id_)); }; - const index_t grid_size = + const index_t a_grid_size = arg.elementwise_block_2_ctile_map_transpose_a_.CalculateGridSize( arg.a_in_transpose_desc_) * arg.num_workgroups_per_Conv_N_; + const index_t b_grid_size = + (is_NGCHW_GKCYX_NGKHW() || + is_NGCDHW_GKCZYX_NGKDHW()) + ? 
arg.elementwise_block_2_ctile_map_transpose_b_.CalculateGridSize( + arg.b_in_transpose_desc_) + : 0; // Dont run transpose B if not needed ADataType* p_a_out_grid = type_convert(arg.p_workspace_); + BDataType* p_b_out_grid = type_convert(arg.p_workspace_) + + arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType); auto kernel_transpose = - kernel_batched_elementwise, - ck::Tuple, - ck::Tuple, - ck::Tuple, - Block2TileMapElementwise, - element_wise::PassThrough, - I1, - I1>; + kernel_elementwise_batched_dual, + ck::Tuple, + ck::Tuple, + ck::Tuple, + ck::Tuple, + ck::Tuple, + ck::Tuple, + ck::Tuple, + Block2TileMapInOutElementwise, + Block2TileMapWeiElementwise, + element_wise::PassThrough, + I1, + I1, + I1, + I1>; ave_time += launch_and_time_kernel_with_preprocess( stream_config, clear_workspace, kernel_transpose, - dim3(grid_size), + dim3(a_grid_size + b_grid_size), dim3(ElementwiseBlocksize), 0, make_tuple(arg.a_in_transpose_desc_), + make_tuple(arg.b_in_transpose_desc_), make_tuple(arg.a_out_transpose_desc_), + make_tuple(arg.b_out_transpose_desc_), make_tuple(arg.p_a_grid_), + make_tuple(arg.p_b_grid_), make_tuple(p_a_out_grid), + make_tuple(p_b_out_grid), arg.elementwise_block_2_ctile_map_transpose_a_, + arg.elementwise_block_2_ctile_map_transpose_b_, element_wise::PassThrough{}, + a_grid_size, arg.num_workgroups_per_Conv_N_, + I1, // B is not splited per N std::array{ static_cast(arg.compute_ptr_offset_of_workspace_n_.BatchStrideA_)}, + std::array{0}, std::array{ - static_cast(arg.compute_ptr_offset_of_n_.BatchStrideA_)}); + static_cast(arg.compute_ptr_offset_of_n_.BatchStrideA_)}, + std::array{0}); } ave_time += RunGemm(arg, stream_config); // Transpose from NHWGC to NGCHW - if constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) { const index_t grid_size = arg.elementwise_block_2_ctile_map_transpose_e_.CalculateGridSize( @@ -987,7 +1095,8 @@ struct 
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 const EDataType* p_e_in_grid = type_convert(arg.p_workspace_) + - arg.GetWorkspaceATensorSizeBytes() / sizeof(EDataType); + (arg.GetWorkspaceATensorSizeBytes() + arg.GetWorkspaceBTensorSizeBytes()) / + sizeof(EDataType); EDataType* p_e_out_grid = arg.p_e_grid_; @@ -997,7 +1106,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 ck::Tuple, ck::Tuple, ck::Tuple, - Block2TileMapElementwise, + Block2TileMapInOutElementwise, element_wise::PassThrough, I1, I1>; @@ -1077,7 +1186,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 // vector load for B matrix from global memory to LDS if constexpr(is_same_v || - is_same_v) + is_same_v || + is_same_v || + is_same_v) { if(!(BBlockTransferSrcVectorDim == 1 && ConvC % BBlockTransferSrcScalarPerVector == 0)) { @@ -1152,8 +1263,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 } } - if constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) { if((ConvG * ConvC) % CDEBlockTransferScalarPerVector_NPerBlock != 0) { @@ -1320,8 +1431,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 << CShuffleMXdlPerWavePerShuffle << ", " << CShuffleNXdlPerWavePerShuffle; - if constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) { + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) { str << ", TransposeTransferInScalarPerVectorAligned: " << TransposeTransferInScalarPerVectorAligned <<", " << "TransposeTransferOutScalarPerVectorAligned: " << TransposeTransferOutScalarPerVectorAligned; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp index 0edfc9b0ee..1326c5d62d 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp @@ -93,6 +93,119 @@ __global__ void } } +template +__global__ void +#if 
CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_elementwise_batched_dual( + const InAGridDescTuple in_grid_desc_tuple_a, + const InBGridDescTuple in_grid_desc_tuple_b, + const OutAGridDescTuple out_grid_desc_tuple_a, + const OutBGridDescTuple out_grid_desc_tuple_b, + const InADataTypePointerTuple p_in_global_tuple_a, + const InBDataTypePointerTuple p_in_global_tuple_b, + const OutADataTypePointerTuple p_out_global_tuple_a, + const OutBDataTypePointerTuple p_out_global_tuple_b, + const Block2TileMapA block_2_tile_map_a, + const Block2TileMapB block_2_tile_map_b, + const ElementwiseOperation elementwise_op, + const index_t a_grid_size, + const index_t batch_count_a, + const index_t batch_count_b, + const std::array input_batch_strides_a, + const std::array input_batch_strides_b, + const std::array output_batch_strides_a, + const std::array output_batch_strides_b) +{ + static_assert(InAGridDescTuple::Size() == NumInputsA && + InADataTypePointerTuple::Size() == NumInputsA); + static_assert(OutAGridDescTuple::Size() == NumOutputsA && + OutADataTypePointerTuple::Size() == NumOutputsA); + static_assert(InBGridDescTuple::Size() == NumInputsB && + InBDataTypePointerTuple::Size() == NumInputsB); + static_assert(OutBGridDescTuple::Size() == NumOutputsB && + OutBDataTypePointerTuple::Size() == NumOutputsB); + + const index_t block_id = __builtin_amdgcn_readfirstlane(get_block_1d_id()); + + if(block_id < a_grid_size) + { + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(a_grid_size / batch_count_a); + const index_t g_idx = __builtin_amdgcn_readfirstlane(block_id / num_blocks_per_batch); + + InADataTypePointerTuple p_in_global_with_offset_tuple; + OutADataTypePointerTuple p_out_global_with_offset_tuple; + + static_for<0, InADataTypePointerTuple::Size(), 1>{}([&](auto i) { + p_in_global_with_offset_tuple(i) = + p_in_global_tuple_a.At(i) + + type_convert(input_batch_strides_a[i]) * g_idx; + }); + + 
static_for<0, OutADataTypePointerTuple::Size(), 1>{}([&](auto i) { + p_out_global_with_offset_tuple(i) = + p_out_global_tuple_a.At(i) + + type_convert(output_batch_strides_a[i]) * g_idx; + }); + + GridwiseElementwiseFunctorA::Run(in_grid_desc_tuple_a, + out_grid_desc_tuple_a, + p_in_global_with_offset_tuple, + p_out_global_with_offset_tuple, + block_2_tile_map_a, + elementwise_op, + block_id); + } + else + { + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane((get_grid_size() - a_grid_size) / batch_count_b); + const index_t g_idx = + __builtin_amdgcn_readfirstlane((block_id - a_grid_size) / num_blocks_per_batch); + + InBDataTypePointerTuple p_in_global_with_offset_tuple; + OutBDataTypePointerTuple p_out_global_with_offset_tuple; + + static_for<0, InBDataTypePointerTuple::Size(), 1>{}([&](auto i) { + p_in_global_with_offset_tuple(i) = + p_in_global_tuple_b.At(i) + + type_convert(input_batch_strides_b[i]) * g_idx; + }); + + static_for<0, OutBDataTypePointerTuple::Size(), 1>{}([&](auto i) { + p_out_global_with_offset_tuple(i) = + p_out_global_tuple_b.At(i) + + type_convert(output_batch_strides_b[i]) * g_idx; + }); + + GridwiseElementwiseFunctorB::Run(in_grid_desc_tuple_b, + out_grid_desc_tuple_b, + p_in_global_with_offset_tuple, + p_out_global_with_offset_tuple, + block_2_tile_map_b, + elementwise_op, + block_id - a_grid_size); + } +} + template +using device_grouped_conv_bwd_data_xdl_f16_generic_instances = + std::tuple< + // clang-format off + // ##############################################| NDim| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| 
CShuffleMXdl| CShuffleNXdl| CDEBlockTransfer| CDEBlockTransfer| + // ##############################################| Spatial| | | | | Type| Type| Type| DataType| Type| Type| Operation| Operation| Operation| DataSpecialization| GemmM| GemmN| PrefetchStage| Size| Block| Block| Block| | | XDL| XDL| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| PerWave| PerWave| _MBlock_MPerBlock| ScalarPerVector| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // generic instance + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1> + // clang-format on + >; + template ; // bf16_bf16_f32_bf16 +template +using device_grouped_conv_bwd_data_xdl_bf16_generic_instances = std::tuple< + // clang-format off + // ##############################################| NDim| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl| CDEBlockTransfer| CDEBlockTransfer| + // ##############################################| Spatial| | | | | Type| Type| Type| 
DataType| Type| Type| Operation| Operation| Operation| DataSpecialization| GemmM| GemmN| PrefetchStage| Size| Block| Block| Block| | | XDL| XDL| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| PerWave| PerWave| _MBlock_MPerBlock| ScalarPerVector| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // generic instance + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1> + // clang-format on + >; + template ; // f32_f32_f32_f32 +template +using device_grouped_conv_bwd_data_xdl_f32_generic_instances = + std::tuple< + // clang-format off + // ##############################################| NDim| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl| CDEBlockTransfer| CDEBlockTransfer| + // ##############################################| Spatial| | | | | Type| Type| Type| DataType| Type| Type| Operation| Operation| Operation| DataSpecialization| GemmM| GemmN| PrefetchStage| Size| Block| Block| Block| | | XDL| XDL| PerWave| 
PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| PerWave| PerWave| _MBlock_MPerBlock| ScalarPerVector| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // generic instance + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1> + // clang-format on + >; + template +using device_grouped_conv_fwd_xdl_bf16_generic_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| 
SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // generic instance + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1> + // clang-format on + >; + template ; +template +using device_grouped_conv_fwd_xdl_f16_generic_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + 
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // generic instance + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1> + // clang-format on + >; + template ; +template +using device_grouped_conv_fwd_xdl_f32_generic_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | 
Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // generic instance + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1> + // clang-format on + >; + template ; +template +using device_grouped_conv_fwd_xdl_int8_generic_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | 
| | | | | | | | | | | | | | | | | | | | | | | + // generic instance + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1> + // clang-format on + >; + template && is_same_v && + is_same_v) + { +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_instances(op_ptrs); + add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_vec_transpose_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_FP32 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_instances(op_ptrs); + add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_vec_transpose_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_instances( + op_ptrs); + add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_vec_transpose_instances( + op_ptrs); + } #endif } } @@ -261,6 +296,43 @@ struct DeviceOperationInstanceFactory< add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_bf16_instances( op_ptrs); } +#endif + } + if constexpr(is_same_v && is_same_v && + is_same_v) + { +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_instances( + op_ptrs); + add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_vec_transpose_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_FP32 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_instances( + op_ptrs); + 
add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_vec_transpose_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_instances( + op_ptrs); + add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_vec_transpose_instances( + op_ptrs); + } #endif } } diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc index 6f82117731..5be8f29e99 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc @@ -147,6 +147,94 @@ void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_bf16_instances( PassThrough>>>& instances); #endif +#ifdef CK_ENABLE_FP16 +void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_vec_transpose_instances( + std::vector>>& instances); +#endif +#ifdef CK_ENABLE_FP32 +void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_vec_transpose_instances( + std::vector>>& instances); +#endif +#ifdef CK_ENABLE_BF16 +void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_vec_transpose_instances( + std::vector>>& instances); +#endif + // conv3d backward data #ifdef CK_ENABLE_FP16 void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f16_instances( @@ -300,6 +388,93 @@ void 
add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_bf16_instances( PassThrough, PassThrough>>>& instances); #endif +#ifdef CK_ENABLE_FP16 +void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_instances( + std::vector>>& instances); + +void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_vec_transpose_instances( + std::vector>>& instances); +#endif +#ifdef CK_ENABLE_FP32 +void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_instances( + std::vector>>& instances); + +void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_vec_transpose_instances( + std::vector>>& instances); +#endif +#ifdef CK_ENABLE_BF16 +void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_instances( + std::vector>>& instances); + +void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_vec_transpose_instances( + std::vector>>& instances); +#endif } // namespace instance } // namespace device diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt index 50b724206e..913ebd3a12 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt @@ -10,6 +10,12 @@ add_instance_library( xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp + xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp + xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp + xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp + xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_vec_transpose_instance.cpp + 
xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_vec_transpose_instance.cpp + xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_vec_transpose_instance.cpp wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp new file mode 100644 index 0000000000..23aeeaf505 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for out[n, ho, wo, g, c] * wei[g, k, y, x, c] = in[n, hi, wi, g, k] +void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_bf16_instances<2, + NGKHW, + GKCYX, + Empty_Tuple, + NGCHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_vec_transpose_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_vec_transpose_instance.cpp new file mode 100644 index 0000000000..b6e4c170df --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_vec_transpose_instance.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for out[n, ho, wo, g, c] * wei[g, k, y, x, c] = in[n, hi, wi, g, k] +void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_vec_transpose_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_transpose_xdl_bf16_instances<2, + NGKHW, + GKCYX, + Empty_Tuple, + NGCHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp new file mode 100644 index 0000000000..beeda26690 --- /dev/null +++ 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for out[n, ho, wo, g, c] * wei[g, k, y, x, c] = in[n, hi, wi, g, k] +void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_f16_instances<2, + NGKHW, + GKCYX, + Empty_Tuple, + NGCHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_vec_transpose_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_vec_transpose_instance.cpp new file mode 100644 index 0000000000..234fd53c8c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_vec_transpose_instance.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for out[n, ho, wo, g, c] * wei[g, k, y, x, c] = in[n, hi, wi, g, k] +void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_vec_transpose_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_transpose_xdl_f16_instances<2, + NGKHW, + GKCYX, + Empty_Tuple, + NGCHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp new file mode 100644 index 0000000000..a1d768f4eb --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for out[n, ho, wo, g, c] * wei[g, k, y, x, c] = in[n, hi, wi, g, k] +void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_f32_instances<2, + NGKHW, + GKCYX, + Empty_Tuple, + NGCHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_vec_transpose_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_vec_transpose_instance.cpp new file mode 100644 index 0000000000..3a8b22924a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_vec_transpose_instance.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for out[n, ho, wo, g, c] * wei[g, k, y, x, c] = in[n, hi, wi, g, k] +void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_vec_transpose_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_transpose_xdl_f32_instances<2, + NGKHW, + GKCYX, + Empty_Tuple, + NGCHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp index 974615c434..38c3ebc67b 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp @@ -9,7 +9,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[n, hi, wi, g, c] * wei[g, k, y, x, c] = in[n, ho, wo, g, k] +// Compilation parameters for out[n, ho, wo, g, c] * wei[g, k, y, x, c] = in[n, hi, wi, g, k] void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_bf16_instances( std::vector{}); - add_device_operation_instances( - instances, - 
device_grouped_conv_bwd_data_transpose_xdl_bf16_instances<2, - NGKHW, - GKYXC, - Empty_Tuple, - NGCHW, - ConvBwdDataDefault>{}); + device_grouped_conv_bwd_data_xdl_bf16_generic_instances<2, + NGKHW, + GKYXC, + Empty_Tuple, + NGCHW, + ConvBwdDataDefault>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp index 272e5f3cb7..e6f3985935 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp @@ -9,7 +9,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[n, hi, wi, g, c] * wei[g, k, y, x, c] = in[n, ho, wo, g, k] +// Compilation parameters for out[n, ho, wo, g, c] * wei[g, k, y, x, c] = in[n, hi, wi, g, k] void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_f16_instances( std::vector{}); - add_device_operation_instances( - instances, - device_grouped_conv_bwd_data_transpose_xdl_f16_instances<2, - NGKHW, - GKYXC, - Empty_Tuple, - NGCHW, - ConvBwdDataDefault>{}); + device_grouped_conv_bwd_data_xdl_f16_generic_instances<2, + NGKHW, + GKYXC, + Empty_Tuple, + NGCHW, + ConvBwdDataDefault>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp index 01cd2c9206..9212c546ca 100644 --- 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp @@ -9,7 +9,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[n, hi, wi, g, c] * wei[g, k, y, x, c] = in[n, ho, wo, g, k] +// Compilation parameters for out[n, ho, wo, g, c] * wei[g, k, y, x, c] = in[n, hi, wi, g, k] void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkyxc_ngchw_f32_instances( std::vector{}); - add_device_operation_instances( - instances, - device_grouped_conv_bwd_data_transpose_xdl_f32_instances<2, - NGKHW, - GKYXC, - Empty_Tuple, - NGCHW, - ConvBwdDataDefault>{}); + device_grouped_conv_bwd_data_xdl_f32_generic_instances<2, + NGKHW, + GKYXC, + Empty_Tuple, + NGCHW, + ConvBwdDataDefault>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp index 5d9194798b..75e7f61f8a 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" @@ -8,7 +8,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[n, hi, wi, g, c] * wei[g, k, y, x, c] = in[n, ho, wo, g, k] +// Compilation parameters for out[n, ho, wo, g, c] * wei[g, k, y, x, c] = in[n, hi, wi, g, k] void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances( std::vector>>& instances) { - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_bf16_instances<2, - NGCHW, - GKYXC, - Empty_Tuple, - NGKHW, - ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_generic_instances<2, + NGCHW, + GKYXC, + Empty_Tuple, + NGKHW, + ConvFwdDefault>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp index e002058557..78d1747548 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" @@ -23,13 +23,14 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instances( PassThrough, PassThrough>>>& instances) { - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_f16_instances<2, - NGCHW, - GKYXC, - Empty_Tuple, - NGKHW, - ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_generic_instances<2, + NGCHW, + GKYXC, + Empty_Tuple, + NGKHW, + ConvFwdDefault>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp index 1033db4972..5c8c3cb8a5 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" @@ -23,13 +23,14 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instances( PassThrough, PassThrough>>>& instances) { - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_f32_instances<2, - NGCHW, - GKYXC, - Empty_Tuple, - NGKHW, - ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f32_generic_instances<2, + NGCHW, + GKYXC, + Empty_Tuple, + NGKHW, + ConvFwdDefault>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp index 65c75fa043..d89c29327c 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" @@ -23,13 +23,14 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instances( PassThrough, PassThrough>>>& instances) { - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_int8_instances<2, - NGCHW, - GKYXC, - Empty_Tuple, - NGKHW, - ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_int8_generic_instances<2, + NGCHW, + GKYXC, + Empty_Tuple, + NGKHW, + ConvFwdDefault>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt index 4ab7335f7d..a656c79289 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt @@ -9,6 +9,12 @@ set(GROUPED_CONV3D_BWD_DATA xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp + xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp + xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp + xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp + xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_vec_transpose_instance.cpp + xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_vec_transpose_instance.cpp + xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_vec_transpose_instance.cpp wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp 
wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp new file mode 100644 index 0000000000..a9a6b4d281 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo, +// g, k] +void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_bf16_instances<3, + NGKDHW, + GKCZYX, + Empty_Tuple, + NGCDHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_vec_transpose_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_vec_transpose_instance.cpp new file mode 100644 index 0000000000..e0703a60fd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_vec_transpose_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo, +// g, k] +void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_vec_transpose_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_transpose_xdl_bf16_instances<3, + NGKDHW, + GKCZYX, + Empty_Tuple, + NGCDHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp new file mode 100644 index 0000000000..eec3944078 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp @@ -0,0 +1,41 @@ +// 
SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo, +// g, k] +void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_f16_instances<3, + NGKDHW, + GKCZYX, + Empty_Tuple, + NGCDHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_vec_transpose_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_vec_transpose_instance.cpp new file mode 100644 index 0000000000..5bbd7863da --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_vec_transpose_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo, +// g, k] +void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_vec_transpose_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_transpose_xdl_f16_instances<3, + NGKDHW, + GKCZYX, + Empty_Tuple, + NGCDHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp new file mode 100644 index 0000000000..a596482ca8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo, +// g, k] +void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_f32_instances<3, + NGKDHW, + GKCZYX, + Empty_Tuple, + NGCDHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_vec_transpose_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_vec_transpose_instance.cpp new file mode 100644 index 0000000000..d68062a707 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_vec_transpose_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo, +// g, k] +void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_vec_transpose_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_transpose_xdl_f32_instances<3, + NGKDHW, + GKCZYX, + Empty_Tuple, + NGCDHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp index 88e091568c..b42eca238f 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp @@ -27,20 +27,12 @@ void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_bf16_instances( { add_device_operation_instances( instances, - device_grouped_conv_bwd_data_xdl_bf16_instances<3, - NGKDHW, - GKZYXC, - Empty_Tuple, - NGCDHW, - ConvBwdDataDefault>{}); - add_device_operation_instances( - instances, - device_grouped_conv_bwd_data_transpose_xdl_bf16_instances<3, - NGKDHW, - GKZYXC, - 
Empty_Tuple, - NGCDHW, - ConvBwdDataDefault>{}); + device_grouped_conv_bwd_data_xdl_bf16_generic_instances<3, + NGKDHW, + GKZYXC, + Empty_Tuple, + NGCDHW, + ConvBwdDataDefault>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp index 0378ec13cb..a66965b4a3 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp @@ -27,20 +27,12 @@ void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_f16_instances( { add_device_operation_instances( instances, - device_grouped_conv_bwd_data_xdl_f16_instances<3, - NGKDHW, - GKZYXC, - Empty_Tuple, - NGCDHW, - ConvBwdDataDefault>{}); - add_device_operation_instances( - instances, - device_grouped_conv_bwd_data_transpose_xdl_f16_instances<3, - NGKDHW, - GKZYXC, - Empty_Tuple, - NGCDHW, - ConvBwdDataDefault>{}); + device_grouped_conv_bwd_data_xdl_f16_generic_instances<3, + NGKDHW, + GKZYXC, + Empty_Tuple, + NGCDHW, + ConvBwdDataDefault>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp index 066fc8a3eb..af21d6dc5d 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp @@ -27,20 +27,12 @@ void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkzyxc_ngcdhw_f32_instances( { add_device_operation_instances( instances, - device_grouped_conv_bwd_data_xdl_f32_instances<3, - NGKDHW, - GKZYXC, - Empty_Tuple, - NGCDHW, - ConvBwdDataDefault>{}); - add_device_operation_instances( - instances, - device_grouped_conv_bwd_data_transpose_xdl_f32_instances<3, - NGKDHW, - GKZYXC, - Empty_Tuple, - NGCDHW, - ConvBwdDataDefault>{}); + device_grouped_conv_bwd_data_xdl_f32_generic_instances<3, + NGKDHW, + GKZYXC, + Empty_Tuple, + NGCDHW, + ConvBwdDataDefault>{}); } } // namespace instance diff --git a/profiler/src/profile_grouped_conv_bwd_data.cpp b/profiler/src/profile_grouped_conv_bwd_data.cpp index 9565833b32..1515f1105f 100644 --- a/profiler/src/profile_grouped_conv_bwd_data.cpp +++ b/profiler/src/profile_grouped_conv_bwd_data.cpp @@ -16,6 +16,7 @@ enum struct ConvLayout GNHWC_GKYXC_GNHWK, // 0 NHWGC_GKYXC_NHWGK, // 1 NGCHW_GKYXC_NGKHW, // 2 + NGCHW_GKCYX_NGKHW, // 3 }; enum struct ConvDataType @@ -36,9 +37,10 @@ static void print_helper_msg() << "arg2: data type (0: Output fp32, Weight fp32, Input fp32\n" << " 1: Output fp16, Weight fp16, Input fp16\n" << " 2: Output bf16, Weight bf16, Input bf16\n" - << "arg3: tensor layout (0: Output[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Input[G, N, Ho, Wo, K]\n" - << " 1: Output[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Input[N, Ho, Wo, G, K])\n" - << " 2: Output[N, G, C, Hi, Wi], Weight[G, K, Y, X, C], Input[N, G, K, Ho, Wo])\n" + << "arg3: tensor layout (0: Output[G, N, Ho, Wo, C], Weight[G, K, Y, X, C], Input[G, N, Hi, Wi, K]\n" + << " 1: Output[N, Ho, Wo, G, C], Weight[G, K, Y, X, C], Input[N, Hi, Wi, G, K])\n" + << " 2: Output[N, G, C, Ho, Wo], Weight[G, K, Y, X, C], Input[N, G, K, Hi, Wi])\n" + << " 3: Output[N, G, C, Ho, Wo], Weight[G, K, C, Y, X], Input[N, G, K, 
Hi, Wi])\n" << "arg4: verification (0: no, 1: yes)\n" << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" << "arg6: print tensor value (0: no; 1: yes)\n" @@ -160,6 +162,21 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[]) return profile(I2, NGKHW{}, GKYXC{}, NGCHW{}, BF16{}, BF16{}, BF16{}); } } + else if(layout == ConvLayout::NGCHW_GKCYX_NGKHW) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I2, NGKHW{}, GKCYX{}, NGCHW{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I2, NGKHW{}, GKCYX{}, NGCHW{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I2, NGKHW{}, GKCYX{}, NGCHW{}, BF16{}, BF16{}, BF16{}); + } + } } else if(num_dim_spatial == 3) { @@ -208,6 +225,21 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[]) return profile(I3, NGKDHW{}, GKZYXC{}, NGCDHW{}, BF16{}, BF16{}, BF16{}); } } + else if(layout == ConvLayout::NGCHW_GKCYX_NGKHW) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I3, NGKDHW{}, GKCZYX{}, NGCDHW{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I3, NGKDHW{}, GKCZYX{}, NGCDHW{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I3, NGKDHW{}, GKCZYX{}, NGCDHW{}, BF16{}, BF16{}, BF16{}); + } + } } std::cout << "this data_type & layout is not implemented" << std::endl; diff --git a/script/convert_miopen_driver_to_profiler.py b/script/convert_miopen_driver_to_profiler.py index 9bb668e164..81f9977542 100644 --- a/script/convert_miopen_driver_to_profiler.py +++ b/script/convert_miopen_driver_to_profiler.py @@ -30,10 +30,9 @@ def parse_layouts(args): if args.in_layout == "NCW" or args.in_layout == "NCHW" or \ args.in_layout == "NCDHW": if args.ck_profier_op == "grouped_conv_bwd_weight" or \ - args.ck_profier_op == 
"grouped_conv_fwd" or \ + args.ck_profier_op == "grouped_conv_bwd_data": args.layout = 3 - elif args.ck_profier_op == "grouped_conv_bwd_data": - args.layout = 2 else: print('Not supported layout for this op') exit(1) diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp index 3fe4dac2ba..eb6083c521 100644 --- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp +++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp @@ -54,6 +54,9 @@ using KernelTypes2d = ::testing::Types, std::tuple, std::tuple, std::tuple, + std::tuple, + std::tuple, + std::tuple, std::tuple, std::tuple, std::tuple>; @@ -64,6 +67,9 @@ using KernelTypes3d = ::testing::Types std::tuple, std::tuple, std::tuple, + std::tuple, + std::tuple, + std::tuple, std::tuple, std::tuple, std::tuple>; From e5ad48a7843a16a1ed0c1268b5dba7dfe2d59e4d Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 2 Apr 2025 11:03:40 +0200 Subject: [PATCH 014/443] Basic docs for universal gemm & ck-tile gemm. (#2014) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Basic docs for universal gemm & ck-tile gemm. 
* Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: BartÅ‚omiej Kocot * Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp Co-authored-by: BartÅ‚omiej Kocot * Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: BartÅ‚omiej Kocot * Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp Co-authored-by: BartÅ‚omiej Kocot * Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: BartÅ‚omiej Kocot * Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp Co-authored-by: BartÅ‚omiej Kocot * Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: BartÅ‚omiej Kocot * Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd * Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp Co-authored-by: BartÅ‚omiej Kocot * Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp Co-authored-by: BartÅ‚omiej Kocot * Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp Co-authored-by: BartÅ‚omiej Kocot * Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: BartÅ‚omiej Kocot * Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd * Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: BartÅ‚omiej Kocot * Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd * Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd * Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd * Update 
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd * Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd * Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd * Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp Co-authored-by: spolifroni-amd * Reviewers suggestions. * Align tparam names in doc with class tparams. * More reviewers fine tuning ;) --------- Co-authored-by: BartÅ‚omiej Kocot Co-authored-by: spolifroni-amd --- .../impl/device_gemm_xdl_cshuffle_v3.hpp | 116 +++++++++++++++++- .../grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 103 ++++++++++++++++ .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp | 60 +++++++++ 3 files changed, 277 insertions(+), 2 deletions(-) mode change 100755 => 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index a8cf681995..51c223efd2 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -21,6 +21,105 @@ namespace ck { namespace tensor_operation { namespace device { +/// @brief \"Universal\" GEMM operation with SplitK support. +/// +/// @par Overview +/// This GEMM operation implements the following mathematical equation: +/// C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N})) +/// Where A, B are input tensors and C is the output tensor. 
The A/B/C_op are +/// elementwise operations applied to the A, B, and C tensors, respectively. +/// The \"universal\" gemm comes with multiple pipelines optimized for different usage +/// scenarios. That's why it's called \"universal\". It's universal through its design +/// and versatility. +/// +/// @note This Kernel implementation supports SplitK algorithm. It can be configured +/// to split the dot product accumulated over the K dimension into multiple working groups. +/// The partial products of different workgroups are then reduced using the AtomicAdd +/// operation. +/// +/// @tparam ALayout A tensor data layout. +/// @tparam BLayout B tensor data layout. +/// @tparam CLayout C tensor data layout. +/// @tparam ADataType A tensor data type. +/// @tparam BDataType B tensor data type. +/// @tparam CDataType C tensor data type. +/// @tparam GemmAccDataType The accumulation data type related to the hardware +/// matrix-multiplication instruction. +/// @tparam CShuffleDataType The data type used to store matrix-multiplication results into +/// LDS memory during \"CShuffle\" data layout optimization. +/// @tparam AElementwiseOperation Elementwise operation applied to the A input tensor elements. +/// @tparam BElementwiseOperation Elementwise operation applied to the B input tensor elements. +/// @tparam CElementwiseOperation Elementwise operation applied to the C output tensor +/// (after GEMM). +/// @tparam GemmSpec Determines used "padding" version. +/// @tparam BlockSize The number of threads within workgroup. +/// @tparam MPerBlock The input/output data tile size in the M dimension. +/// @tparam NPerBlock The input/output data tile size in the N dimension. +/// @tparam KPerBlock The input data tile size in the K dimension. +/// @tparam AK1 The vector load size from global memory for A tensor. +/// @tparam BK1 The vector load size from global memory for B tensor. +/// @tparam MPerXDL M size of matrix-fused-multiply-add instruction. 
+/// @tparam NPerXDL N size of matrix-fused-multiply-add instruction. +/// @tparam MXdlPerWave The number of iterations in the M dimension over output tile per wavefront. +/// @tparam NXdlPerWave The number of iterations in the N dimension over output tile per wavefront. +/// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input +/// data. Can be interpreted as the answer +/// to the question, "How many threads can be +/// arranged on each input data axis?" +/// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over +/// the input tensor dimension. Can be interpreted +/// as the answer to the question: "In which +/// order to spread threads through tensor axes?". +/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be +/// interpreted as the answer to the question "Which dimension +/// to read first? And which next?" etc. +/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do vectorized memory +/// access - the one with contiguous memory. +/// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of +/// elements accessed per thread per instruction. +/// @tparam ABlockTransferDstScalarPerVector_AK1 The size of vectorized store into LDS memory. +/// @tparam ABlockLdsExtraM Whether to use padding for LDS or not. With +/// universal GEMM there's no need for padding. +/// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input +/// data. Can be interpreted as the answer +/// to the question: "How many threads to +/// arrange on each input data axis?" +/// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over +/// the input tensor dimension. Can be interpreted +/// as the answer to the question: "In which +/// order to spread threads through tensor axes?". 
+/// @tparam BBlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be +/// interpreted as the answer to the question "Which dimension +/// to read first? And which next?" etc. +/// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do vectorized memory +/// access - the one with contiguous memory. +/// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of +/// elements accessed per thread per instruction. +/// @tparam BBlockTransferDstScalarPerVector_BK1 The size of vectorized store into LDS memory. +/// @tparam BBlockLdsExtraN Whether to use padding for LDS or not. With +/// universal GEMM there's no need for padding. +/// @tparam CShuffleMXdlPerWavePerShuffle The number of matrix-multiplication instructions +/// results to process per wave per iteration of CShuffle +/// in M dimension. +/// @tparam CShuffleNXdlPerWavePerShuffle The number of matrix-multiplication instructions +/// results to process per wave per iteration of CShuffle +/// in N dimension. +/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial +/// thread distribution used for storing data into output +/// tensor across output data layout dimensions. +/// @tparam CShuffleBlockTransferScalarPerVector_NPerBlock The size of vectorized memory access. +/// Used when storing data to output tensor. +/// @tparam BlkGemmPipeSched The version of blockwise-gemm pipeline scheduler (interwave or +/// intrawave). +/// @tparam BlkGemmPipelineVer The version of blockwise-gemm pipeline. +/// @tparam ComputeTypeA Data type used for A input of hardware matrix-multiplication +/// instructions. +/// @tparam ComputeTypeB Data type used for B input of hardware matrix-multiplication +/// instructions. +/// @tparam PermuteA Whether the A input tensor has gridwise-gemm friendly data layout +/// in global memory. Currently not supported! 
+/// @tparam PermuteB Whether the B input tensor has gridwise-gemm friendly data layout +/// in global memory (pre-shuffled). template 0) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp old mode 100755 new mode 100644 index 55639f4aee..9f6d85dd78 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -82,6 +82,109 @@ __global__ void #endif // end of if (defined(__gfx9__)) } +/// @brief \"Universal\" GEMM kernel with SplitK support. +/// +/// @par Overview +/// This GEMM kernel is carrying out following mathematical equation: +/// C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N})) +/// Where A, B are input tensors and C is the output tensor. The A/B/C_op are +/// elementwise operations that could be applied on each tensor respectively. +/// The \"universal\" gemm comes with multiple pipelines optimized for different usage +/// scenarios. That's why it's called \"universal\". It's universal through its design +/// and versatility. +/// +/// @note This Kernel implementation supports SplitK algorithm. It can be configured +/// to split the dot product accumulated over the K dimension into multiple working groups. +/// The partial products of different workgroups are then reduced using the AtomicAdd +/// operation. +/// +/// @tparam ALayout A tensor data layout. +/// @tparam BLayout B tensor data layout. +/// @tparam CLayout C tensor data layout. +/// @tparam ADataType A tensor data type. +/// @tparam BDataType B tensor data type. +/// @tparam AccDataType The accumulation data type related to the hardware +/// matrix-multiplication instruction. +/// @tparam CShuffleDataType The data type used to store matrix-multiplication results into +/// LDS memory during \"CShuffle\" data layout optimization. +/// @tparam CDataType C tensor data type. 
+/// @tparam AElementwiseOperation Elementwise operation applied to the A input tensor elements. +/// @tparam BElementwiseOperation Elementwise operation applied to the B input tensor elements. +/// @tparam CElementwiseOperation Elementwise operation applied to the C output tensor +/// (after GEMM). +/// @tparam GemmSpec Determines used "padding" version. +/// @tparam BlockSize The number of threads within workgroup. +/// @tparam MPerBlock The input/output data tile size in the M dimension. +/// @tparam NPerBlock The input/output data tile size in the N dimension. +/// @tparam KPerBlock The input data tile size in the K dimension. +/// @tparam AK1Value The vector load size from global memory for A tensor. +/// @tparam BK1Value The vector load size from global memory for B tensor. +/// @tparam MPerXdl M size of matrix-fused-multiply-add instruction. +/// @tparam NPerXdl N size of matrix-fused-multiply-add instruction. +/// @tparam MXdlPerWave The number of iterations in the M dimension over output tile per wavefront. +/// @tparam NXdlPerWave The number of iterations in the N dimension over output tile per wavefront. +/// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input +/// data. Can be interpreted as the answer +/// to the question, "How many threads can be +/// arranged on each input data axis?" +/// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over +/// the input tensor dimension. Can be interpreted +/// as the answer to the question: "In which +/// order to spread threads through tensor axes?". +/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be +/// interpreted as the answer to the question "Which dimension +/// to read first? And which next?" etc. +/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do vectorized memory +/// access - the one with contiguous memory. 
+/// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of +/// elements accessed per thread per instruction. +/// @tparam ABlockTransferDstScalarPerVector_AK1 The size of vectorized store into LDS memory. +/// @tparam AThreadTransferSrcResetCoordinateAfterRun Decides whether we reset thread coordinate +/// (return back to the window origin) after all thread finish data copy. +/// @tparam ABlockLdsExtraM Whether to use padding for LDS or not. With +/// universal GEMM there's no need for padding. +/// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input +/// data. Can be interpreted as the answer +/// to the question: "How many threads to +/// arrange on each input data axis?" +/// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over +/// the input tensor dimension. Can be interpreted +/// as the answer to the question: "In which +/// order to spread threads through tensor axes?". +/// @tparam BBlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be +/// interpreted as the answer to the question "Which dimension +/// to read first? And which next?" etc. +/// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do vectorized memory +/// access - the one with contiguous memory. +/// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of +/// elements accessed per thread per instruction. +/// @tparam BBlockTransferDstScalarPerVector_BK1 The size of vectorized store into LDS memory. +/// @tparam BThreadTransferSrcResetCoordinateAfterRun Decides whether we reset thread coordinate +/// (return back to the window origin) after all thread finish data copy. +/// @tparam BBlockLdsExtraN Whether to use padding for LDS or not. With +/// universal GEMM there's no need for padding. 
+/// @tparam CShuffleMXdlPerWavePerShuffle The number of matrix-multiplication instructions +/// results to process per wave per iteration of CShuffle +/// in M dimension. +/// @tparam CShuffleNXdlPerWavePerShuffle The number of matrix-multiplication instructions +/// results to process per wave per iteration of CShuffle +/// in N dimension. +/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial +/// thread distribution used for storing data into output +/// tensor across output data layout dimensions. +/// @tparam CShuffleBlockTransferScalarPerVector_NPerBlock The size of vectorized memory access. +/// Used when storing data to output tensor. +/// @tparam BlkGemmPipeSched The version of blockwise-gemm pipeline scheduler (interwave or +/// intrawave). +/// @tparam BlkGemmPipelineVer The version of blockwise-gemm pipeline. +/// @tparam ComputeTypeA Data type used for A input of hardware matrix-multiplication +/// instructions. +/// @tparam ComputeTypeB Data type used for B input of hardware matrix-multiplication +/// instructions. +/// @tparam PermuteA Whether the A input tensor has gridwise-gemm friendly data layout +/// in global memory. Currently not supported! +/// @tparam PermuteB Whether the B input tensor has gridwise-gemm friendly data layout +/// in global memory (pre-shuffled). 
template struct GemmKernel { From 2ccf91488878239c8dde9b3be312b84311907a44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Wed, 2 Apr 2025 23:59:49 +0200 Subject: [PATCH 015/443] Add support for GKCYX grouped conv weight (#2023) * Grouped conv bwd weight GKCYX support * fix and changelog * fix * fix * fixes * comments * fix --- CHANGELOG.md | 3 +- .../07_grouped_convnd_fwd/README.md | 16 +- .../11_grouped_conv_bwd_weight/README.md | 12 +- ...conv_bwd_weight_two_stage_xdl_cshuffle.hpp | 203 +++++++++----- ...e_grouped_conv_bwd_weight_xdl_cshuffle.hpp | 249 +++++++++++++----- ...ped_conv_fwd_multiple_abd_xdl_cshuffle.hpp | 5 + .../transform_conv_ngchw_to_nhwgc.hpp | 9 + .../grouped_convolution_backward_weight.hpp | 114 ++++++-- ...rouped_convolution_backward_weight_xdl.inc | 120 +++++++-- .../grouped_conv2d_bwd_weight/CMakeLists.txt | 90 ++++--- ...hwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp | 0 ...kyxc_gnhwk_f16_default_pipev1_instance.cpp | 0 ...ght_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 0 ...c_gkyxc_gnhwk_f16_pad0_pipev1_instance.cpp | 0 ...kyxc_gnhwk_f32_default_pipev1_instance.cpp | 0 ...ght_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 0 ...c_gkyxc_gnhwk_f32_pad0_pipev1_instance.cpp | 0 ...ngchw_gkcyx_ngkhw_bf16_pipev1_instance.cpp | 41 +++ ...gchw_gkcyx_ngkhw_bf16_pipev2_instance.cpp} | 8 +- ...gchw_gkcyx_ngkhw_bf16_pipev5_instance.cpp} | 8 +- ..._ngchw_gkcyx_ngkhw_f16_pipev1_instance.cpp | 41 +++ ...ngchw_gkcyx_ngkhw_f16_pipev2_instance.cpp} | 8 +- ...ngchw_gkcyx_ngkhw_f16_pipev5_instance.cpp} | 8 +- ...t_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp} | 8 +- ...ht_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp} | 8 +- ...ht_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp} | 8 +- ...ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp | 2 +- ..._ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp | 2 +- ...ght_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp | 38 +++ ...nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp | 2 +- ...nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp | 2 +- 
...c_nhwgk_bf16_pipev2_irregular_instance.cpp | 0 ...nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp | 2 +- ...c_nhwgk_bf16_pipev5_irregular_instance.cpp | 0 ..._nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp | 2 +- ..._nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp | 2 +- ...xc_nhwgk_f16_pipev2_irregular_instance.cpp | 0 ..._nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp | 2 +- ...xc_nhwgk_f16_pipev5_irregular_instance.cpp | 0 ...yxc_nhwgk_bf16_default_pipev2_instance.cpp | 0 ...yxc_nhwgk_bf16_default_pipev5_instance.cpp | 0 ...wgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp | 2 +- ...ht_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 2 +- ..._gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp | 0 ..._gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp | 0 ...kyxc_nhwgk_f16_default_pipev2_instance.cpp | 0 ...kyxc_nhwgk_f16_default_pipev5_instance.cpp | 0 ...ght_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 2 +- ...c_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp | 0 ...c_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp | 0 ...kyxc_nhwgk_f32_default_pipev2_instance.cpp | 0 ...kyxc_nhwgk_f32_default_pipev5_instance.cpp | 0 ...ght_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp | 2 +- ...c_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp | 0 ...c_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp | 0 .../grouped_conv3d_bwd_weight/CMakeLists.txt | 84 +++--- ...c_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp | 0 ..._xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp | 0 ..._xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp | 0 ...wgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp | 2 +- ...wgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp | 2 +- ..._ndhwgk_bf16_pipev2_irregular_instance.cpp | 0 ...wgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp | 2 +- ..._ndhwgk_bf16_pipev5_irregular_instance.cpp | 0 ...hwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp | 2 +- ...hwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp | 2 +- ...c_ndhwgk_f16_pipev2_irregular_instance.cpp | 0 ...hwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp | 2 +- ...c_ndhwgk_f16_pipev5_irregular_instance.cpp | 0 ...xc_ndhwgk_bf16_default_pipev2_instance.cpp | 0 
...xc_ndhwgk_bf16_default_pipev5_instance.cpp | 0 ...c_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp | 2 +- ...xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp | 2 +- ...kzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp | 0 ...kzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp | 0 ...kzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp | 2 +- ...yxc_ndhwgk_f16_default_pipev2_instance.cpp | 0 ...yxc_ndhwgk_f16_default_pipev5_instance.cpp | 0 ..._xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp | 2 +- ...gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp | 0 ...gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp | 0 ...yxc_ndhwgk_f32_default_pipev2_instance.cpp | 0 ...yxc_ndhwgk_f32_default_pipev5_instance.cpp | 0 ..._xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp | 2 +- ...gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp | 0 ...gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp | 0 ...dhw_gkczyx_ngkdhw_bf16_pipev1_instance.cpp | 41 +++ ...hw_gkczyx_ngkdhw_bf16_pipev2_instance.cpp} | 8 +- ...hw_gkczyx_ngkdhw_bf16_pipev5_instance.cpp} | 8 +- ...cdhw_gkczyx_ngkdhw_f16_pipev1_instance.cpp | 41 +++ ...dhw_gkczyx_ngkdhw_f16_pipev2_instance.cpp} | 8 +- ...dhw_gkczyx_ngkdhw_f16_pipev5_instance.cpp} | 8 +- ...dl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp} | 8 +- ...xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp} | 8 +- ...xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp} | 8 +- ...dhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp | 2 +- ...cdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp | 2 +- ..._xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp | 38 +++ .../src/profile_grouped_conv_bwd_weight.cpp | 36 ++- script/convert_miopen_driver_to_profiler.py | 5 +- .../test_grouped_convnd_bwd_weight.cpp | 12 +- 101 files changed, 1004 insertions(+), 356 deletions(-) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => gnhwc_gkyxc_gnhwk}/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => 
gnhwc_gkyxc_gnhwk}/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev1_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => gnhwc_gkyxc_gnhwk}/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => gnhwc_gkyxc_gnhwk}/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev1_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => gnhwc_gkyxc_gnhwk}/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev1_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => gnhwc_gkyxc_gnhwk}/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => gnhwc_gkyxc_gnhwk}/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev1_instance.cpp (100%) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev1_instance.cpp rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp => ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev2_instance.cpp} (88%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instance.cpp => ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev5_instance.cpp} (88%) create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev1_instance.cpp rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp => ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev2_instance.cpp} (88%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instance.cpp => ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev5_instance.cpp} (88%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp => ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp} (94%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp => ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp} (94%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp => ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp} (94%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => ngchw_gkyxc_ngkhw}/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp (95%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => ngchw_gkyxc_ngkhw}/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp (95%) create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp (95%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp (95%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_irregular_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp (95%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_irregular_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp (95%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp (95%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_irregular_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp (95%) rename 
library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_irregular_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp (95%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp (96%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp (97%) rename 
library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp (97%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/{ => nhwgc_gkyxc_nhwgk}/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => gndhwc_gkzyxc_gndhwk}/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => gndhwc_gkzyxc_gndhwk}/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => gndhwc_gkzyxc_gndhwk}/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ 
=> ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp (95%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp (95%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp (95%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp (95%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp (95%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp (95%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => 
ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp (95%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp (96%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp (97%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp (100%) rename 
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp (97%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp (97%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ndhwgc_gkzyxc_ndhwgk}/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp (100%) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev1_instance.cpp rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp => 
ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev2_instance.cpp} (89%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp => ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev5_instance.cpp} (89%) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev1_instance.cpp rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp => ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev2_instance.cpp} (89%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instance.cpp => ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev5_instance.cpp} (89%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp => ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp} (93%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp => ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp} (93%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp => ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp} (93%) 
rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ngcdhw_gkzyxc_ngkdhw}/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp (95%) rename library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/{ => ngcdhw_gkzyxc_ngkdhw}/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp (95%) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkzyxc_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 8cc32e7bda..f9da2b3117 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,8 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj * Added support for bf16, f32, and f16 for 2D and 3D NGCHW grouped convolution backward data * Added support for GKCYX layout for grouped convolution forward (NGCHW/GKCYX/NGKHW). +* Added support for GKCYX layout for grouped convolution backward weight (NGCHW/GKCYX/NGKHW). * Added support for GKCYX layout for grouped convolution backward data (NGCHW/GKCYX/NGKHW). -* Added support GKCYX layout for grouped convolution forward (NGCHW/GKCYX/NGKHW, number of instances in instance factory for NGCHW/GKYXC/NGKHW has been reduced). * Added support for Stream-K version of mixed fp8/bf16 GEMM ### Optimized @@ -26,6 +26,7 @@ None * Replaced the raw buffer load/store intrinsics with Clang20 built-ins (#1876) * DL and DPP kernels are now enabled by default. * Number of instances in instance factory for grouped convolution forward NGCHW/GKYXC/NGKHW has been reduced. +* Number of instances in instance factory for grouped convolution backward weight NGCHW/GKYXC/NGKHW has been reduced. * Number of instances in instance factory for grouped convolution backward data NGCHW/GKYXC/NGKHW has been reduced. 
### Known issues diff --git a/client_example/07_grouped_convnd_fwd/README.md b/client_example/07_grouped_convnd_fwd/README.md index 28a64ad733..9e96df222d 100644 --- a/client_example/07_grouped_convnd_fwd/README.md +++ b/client_example/07_grouped_convnd_fwd/README.md @@ -30,14 +30,14 @@ List of the device operations for grouped convolution forward in CK: Table of supported cases by instance factory with XDL instruction: -| |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|GNHWC/GKYXC/GNHWK| -|-------|---|---|---| -|bf16 |2D, 3D|2D|1D, 2D, 3D| -|fp16 |2D, 3D|2D|1D, 2D, 3D| -|fp32 |2D, 3D|2D|1D, 2D, 3D| -|int8 |2D, 3D|2D|1D, 3D| -|fp8 |3D|✗|✗| -|bf8 |3D|✗|✗| +| |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|NGCHW/GKCYX/NGKHW|GNHWC/GKYXC/GNHWK| +|-------|---|---|---|---| +|bf16 |2D, 3D|2D|2D|1D, 2D, 3D| +|fp16 |2D, 3D|2D|2D|1D, 2D, 3D| +|fp32 |2D, 3D|2D|2D|1D, 2D, 3D| +|int8 |2D, 3D|2D|2D|1D, 3D| +|fp8 |3D|✗|✗|✗| +|bf8 |3D|✗|✗|✗| Table of supported cases by instance factory with WMMA instruction: diff --git a/client_example/11_grouped_conv_bwd_weight/README.md b/client_example/11_grouped_conv_bwd_weight/README.md index 834fd62c8f..f1ba95e9cd 100644 --- a/client_example/11_grouped_conv_bwd_weight/README.md +++ b/client_example/11_grouped_conv_bwd_weight/README.md @@ -34,12 +34,12 @@ List of the device operations for grouped convolution backward weight in CK: Table of supported cases by instance factory with XDL instruction: -| |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|GNHWC/GKYXC/GNHWK| -|-------|---|---|---| -|bf16|2D, 3D|2D, 3D|✗| -|bf16(fp32 for weight)|2D, 3D|✗|1D, 2D, 3D| -|fp16 |2D, 3D|2D, 3D|1D, 2D, 3D| -|fp32 |2D, 3D|2D, 3D|1D, 2D, 3D| +| |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|NGCHW/GKCYX/NGKHW|GNHWC/GKYXC/GNHWK| +|-------|---|---|---|---| +|bf16|2D, 3D|2D, 3D|2D, 3D|✗| +|bf16(fp32 for weight)|2D, 3D|✗|✗|1D, 2D, 3D| +|fp16 |2D, 3D|2D, 3D|2D, 3D|1D, 2D, 3D| +|fp32 |2D, 3D|2D, 3D|2D, 3D|1D, 2D, 3D| Table of supported cases by instance factory with WMMA instruction: diff --git 
a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp index 033b84aafc..4d730b1f37 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp @@ -218,8 +218,8 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle using EDataType = WeiDataType; // If NGCHW then ADataType must be equal to BDataType - static_assert(!(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) || + static_assert(!(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) || is_same_v); using AElementwiseOperation = OutElementwiseOperation; @@ -376,6 +376,12 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle using NHWGCTransposeDescType = remove_cvref_t({}, {}))>; + using GKCYXTransposeDescType = + remove_cvref_t({}, {}))>; + using GKYXCTransposeDescType = + remove_cvref_t({}, {}))>; using ABCGridDescs = decltype(GetABCGridDesc()); @@ -452,6 +458,28 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle Sequence, I1, I1>; + // NPerBlock is used for the first dim which is store dimension + // (with CBlockTransferScalarPerVector_NWaveNPerXdl scalar per vector). + // CBlockTransferScalarPerVector_NWaveNPerXdl is aligned to NPerBlock so + // it is more flexible to use this dim for store dimension with such scalar + // per vector. 
+ using GridwiseElementwiseWeightTransposeCast = + GridwiseElementwise, + Tuple, + Tuple, + Tuple, + Block2TileMapElementwise, + CDEElementwiseOperation, + BlockSize, + MPerBlock, + NPerBlock, + MPerBlock / ClusterLengthMPerBlock, + NPerBlock / ClusterLengthNPerBlock, + Sequence<0, 1>, + Sequence, + Sequence<1>, + I1, + I0>; using GridwiseElementwiseTranspose = GridwiseElementwise, @@ -533,12 +561,15 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle end(a_g_n_k_wos_lengths), begin(output_spatial_lengths_)); - std::array b_g_n_c_wis_strides_transposed = - conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(b_g_n_c_wis_lengths, - b_g_n_c_wis_strides); std::array a_g_n_k_wos_strides_transposed = conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(a_g_n_k_wos_lengths, a_g_n_k_wos_strides); + std::array b_g_n_c_wis_strides_transposed = + conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(b_g_n_c_wis_lengths, + b_g_n_c_wis_strides); + std::array e_g_k_c_xs_strides_transposed = + conv_ngchw_to_nhwgc_transformer.TransposeWeiStrides(e_g_k_c_xs_lengths, + e_g_k_c_xs_strides); const auto descs = conv_to_gemm_transformer_v2 @@ -550,7 +581,7 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle filter_spatial_lengths_, output_spatial_lengths_, b_g_n_c_wis_strides_transposed, - e_g_k_c_xs_strides, + e_g_k_c_xs_strides_transposed, a_g_n_k_wos_strides_transposed, conv_filter_strides, conv_filter_dilations, @@ -580,29 +611,21 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle input_right_pads, k_batch_)[I2]; - elementwise_block_2_ctile_map_ = Block2TileMapElementwise{ - ce_grid_desc_m_n_.GetLength(I0), ce_grid_desc_m_n_.GetLength(I1)}; - const index_t GemmM = a_grid_desc_k0_m_k1_.GetLength(I1); const index_t GemmN = b_grid_desc_k0_n_k1_.GetLength(I1); // A/B/C Batch Stride compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides_transposed[0]; compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_n_c_wis_strides_transposed[0]; - 
compute_ptr_offset_of_batch_.BatchStrideC_ = - Conv_K_ * Conv_C_ * - std::accumulate(begin(filter_spatial_lengths_), - end(filter_spatial_lengths_), - index_t{1}, - std::multiplies<>{}); + compute_ptr_offset_of_batch_.BatchStrideC_ = e_g_k_c_xs_strides_transposed[0]; c_grid_desc_mblock_mperblock_nblock_nperblock_ = GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( ce_grid_desc_m_n_, GridwiseGemm::CalculateMBlock(GemmM), GridwiseGemm::CalculateNBlock(GemmN)); - if constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) { a_in_transpose_desc_ = conv_ngchw_to_nhwgc_transformer.template MakeNGCHWTransposeDesc( @@ -618,17 +641,35 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle conv_ngchw_to_nhwgc_transformer.template MakeNHWGCTransposeDesc( b_g_n_c_wis_lengths, b_g_n_c_wis_strides); + e_in_transpose_desc_ = + conv_ngchw_to_nhwgc_transformer.template MakeGKYXCTransposeDesc( + e_g_k_c_xs_lengths, e_g_k_c_xs_strides); + e_out_transpose_desc_ = + conv_ngchw_to_nhwgc_transformer.template MakeGKCYXTransposeDesc( + e_g_k_c_xs_lengths, e_g_k_c_xs_strides); + elementwise_block_2_ctile_map_transpose_a_ = Block2TileMapElementwise{ a_in_transpose_desc_.GetLength(I0), a_in_transpose_desc_.GetLength(I1)}; elementwise_block_2_ctile_map_transpose_b_ = Block2TileMapElementwise{ b_in_transpose_desc_.GetLength(I0), b_in_transpose_desc_.GetLength(I1)}; } + + elementwise_block_2_ctile_map_ = + is_NGCHW_GKCYX_NGKHW() || + is_NGCDHW_GKCZYX_NGKDHW() + ? 
Block2TileMapElementwise{e_in_transpose_desc_.GetLength(I0), + e_in_transpose_desc_.GetLength(I1)} + : Block2TileMapElementwise{ce_grid_desc_m_n_.GetLength(I0), + ce_grid_desc_m_n_.GetLength(I1)}; } std::size_t GetWorkspaceATensorSizeBytes() const { - return sizeof(ADataType) * a_in_transpose_desc_.GetElementSpaceSize(); + // Align to 128B + return math::integer_divide_ceil( + sizeof(ADataType) * a_in_transpose_desc_.GetElementSpaceSize(), 128) * + 128; } std::size_t GetWorkspaceBTensorSizeBytes() const @@ -638,14 +679,23 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle std::size_t GetWorkspaceETensorSizeBytes() const { - return sizeof(AccDataType) * ce_grid_desc_m_n_.GetElementSpaceSize() * Conv_G_; + // Align to 128B + return math::integer_divide_ceil(sizeof(AccDataType) * + ce_grid_desc_m_n_.GetElementSpaceSize() * Conv_G_, + 128) * + 128; } std::size_t GetWorkspaceSizeBytes() const { - // Transpose require workspace for A and B - if constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) + // 1. We need to transpose A and B for NGCHW and NGKHW layouts + // 2. If C format is GKCYX then transpose during second stage. + // If C format is GKYXC then just perform second stage. + // Due to the fact that E workspace is always needed, we + // allocate them as the first part of the workspace.
+ // [EWorkspace, AWorkspace, BWorkspace] + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) { return GetWorkspaceATensorSizeBytes() + GetWorkspaceBTensorSizeBytes() + GetWorkspaceETensorSizeBytes(); @@ -672,6 +722,8 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle NGCHWTransposeDescType a_in_transpose_desc_, b_in_transpose_desc_; NHWGCTransposeDescType a_out_transpose_desc_, b_out_transpose_desc_; + GKYXCTransposeDescType e_in_transpose_desc_; + GKCYXTransposeDescType e_out_transpose_desc_; // for computing batch offset ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; @@ -728,11 +780,11 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle const ADataType* p_a_grid = arg.p_a_grid_; const BDataType* p_b_grid = arg.p_b_grid_; - if constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) { p_a_grid = type_convert(arg.p_workspace_) + - arg.GetWorkspaceETensorSizeBytes() / sizeof(BDataType); + arg.GetWorkspaceETensorSizeBytes() / sizeof(ADataType); p_b_grid = type_convert(arg.p_workspace_) + (arg.GetWorkspaceETensorSizeBytes() + arg.GetWorkspaceATensorSizeBytes()) / @@ -1373,41 +1425,72 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle float avg_time = 0.f; auto launch_elementwise_kernel = [&]() { const AccDataType* p_c_grid = type_convert(arg.p_workspace_); - const index_t grid_size = arg.elementwise_block_2_ctile_map_.CalculateGridSize( - arg.ce_elementwise_grid_desc_m_n_) * - arg.Conv_G_; std::array in_out_batch_strides = { static_cast(arg.compute_ptr_offset_of_batch_.BatchStrideC_)}; - const auto kernel = kernel_batched_elementwise, - ck::Tuple, - ck::Tuple, - ck::Tuple, - Block2TileMapElementwise, - CDEElementwiseOperation, - I1, - I1>; + if constexpr(is_NGCHW_GKCYX_NGKHW() || + is_NGCDHW_GKCZYX_NGKDHW()) + { + const index_t grid_size = arg.elementwise_block_2_ctile_map_.CalculateGridSize( + arg.e_in_transpose_desc_); - return 
launch_and_time_kernel(stream_config, - kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - make_tuple(arg.ce_elementwise_grid_desc_m_n_), - make_tuple(arg.ce_elementwise_grid_desc_m_n_), - make_tuple(p_c_grid), - make_tuple(arg.p_e_grid_), - arg.elementwise_block_2_ctile_map_, - arg.cde_element_op_, - arg.Conv_G_, - in_out_batch_strides, - in_out_batch_strides); + const auto kernel = kernel_elementwise, + ck::Tuple, + ck::Tuple, + ck::Tuple, + Block2TileMapElementwise, + CDEElementwiseOperation>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + make_tuple(arg.e_in_transpose_desc_), + make_tuple(arg.e_out_transpose_desc_), + make_tuple(p_c_grid), + make_tuple(arg.p_e_grid_), + arg.elementwise_block_2_ctile_map_, + arg.cde_element_op_); + } + else + { + const index_t grid_size = arg.elementwise_block_2_ctile_map_.CalculateGridSize( + arg.ce_elementwise_grid_desc_m_n_) * + arg.Conv_G_; + + const auto kernel = + kernel_batched_elementwise, + ck::Tuple, + ck::Tuple, + ck::Tuple, + Block2TileMapElementwise, + CDEElementwiseOperation, + I1, + I1>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + make_tuple(arg.ce_elementwise_grid_desc_m_n_), + make_tuple(arg.ce_elementwise_grid_desc_m_n_), + make_tuple(p_c_grid), + make_tuple(arg.p_e_grid_), + arg.elementwise_block_2_ctile_map_, + arg.cde_element_op_, + arg.Conv_G_, + in_out_batch_strides, + in_out_batch_strides); + } }; - if constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) { const index_t grid_size_a = arg.elementwise_block_2_ctile_map_transpose_a_.CalculateGridSize( @@ -1417,7 +1500,7 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle arg.b_in_transpose_desc_); ADataType* p_a_out_grid = type_convert(arg.p_workspace_) + - arg.GetWorkspaceETensorSizeBytes() / sizeof(BDataType); + arg.GetWorkspaceETensorSizeBytes() / 
sizeof(ADataType); BDataType* p_b_out_grid = type_convert(arg.p_workspace_) + (arg.GetWorkspaceETensorSizeBytes() + arg.GetWorkspaceATensorSizeBytes()) / @@ -1514,7 +1597,7 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle if constexpr(NDimSpatial == 2) { if constexpr(!(is_NHWGC_GKYXC_NHWGK() || - is_NGCHW_GKYXC_NGKHW())) + is_NGCHW_NGKHW())) { return false; } @@ -1522,7 +1605,7 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle else if constexpr(NDimSpatial == 3) { if constexpr(!(is_NDHWGC_GKZYXC_NDHWGK() || - is_NGCDHW_GKZYXC_NGKDHW())) + is_NGCDHW_NGKDHW())) { return false; } @@ -1597,8 +1680,8 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle return false; } - if constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) { if((arg.Conv_G_ * arg.Conv_C_) % TransposeTransferDstScalarPerVector != 0) { @@ -1767,8 +1850,8 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", " << NumGroupsToMerge; - if constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) { + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) { str << ", TransposeTransferSrcScalarPerVector: " << TransposeTransferSrcScalarPerVector <<", " << "TransposeTransferDstScalarPerVector: " << TransposeTransferDstScalarPerVector; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp index 6d2a354ce3..f40b238c8a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp @@ -165,8 +165,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle using CDataType = WeiDataType; // If NGCHW then ADataType must be equal to BDataType - static_assert(!(is_NGCHW_GKYXC_NGKHW() || - 
is_NGCDHW_GKZYXC_NGKDHW()) || + static_assert(!(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) || is_same_v); using AElementwiseOperation = OutElementwiseOperation; @@ -301,7 +301,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle MPerBlock / ClusterLengthMPerBlock, NPerBlock / ClusterLengthNPerBlock>{}; - using Block2TileMapElementwise = BlockToCTileMap_M00_N0_M01Adapt; + using Block2TileMapTranspose = BlockToCTileMap_M00_N0_M01Adapt; static constexpr index_t TransposeTransferSrcScalarPerVectorAligned = std::min(NPerBlock / ClusterLengthNPerBlock, MaxTransposeTransferSrcScalarPerVector); @@ -314,13 +314,19 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle using NHWGCTransposeDescType = remove_cvref_t({}, {}))>; + using GKCYXTransposeDescType = + remove_cvref_t({}, {}))>; + using GKYXCTransposeDescType = + remove_cvref_t({}, {}))>; - using GridwiseElementwiseTranspose = + using GridwiseInOutTranspose = GridwiseElementwise, Tuple, Tuple, Tuple, - Block2TileMapElementwise, + Block2TileMapTranspose, element_wise::PassThrough, BlockSize, MPerBlock, @@ -333,6 +339,26 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle I1, I0>; + // NPerBlock is used for the first dim which is store dimension + // (with CBlockTransferScalarPerVector_NWaveNPerXdl scalar per vector). 
+ using GridwiseElementwiseWeightTranspose = + GridwiseElementwise, + Tuple, + Tuple, + Tuple, + Block2TileMapTranspose, + element_wise::PassThrough, + BlockSize, + MPerBlock, + NPerBlock, + MPerBlock / ClusterLengthMPerBlock, + NPerBlock / ClusterLengthNPerBlock, + Sequence<1, 0>, + Sequence, + Sequence<1>, + I1, + I0>; + using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight< BlockSize, ADataType, @@ -452,13 +478,15 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle end(a_g_n_k_wos_lengths), begin(output_spatial_lengths_)); - std::array b_g_n_c_wis_strides_transposed = - conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(b_g_n_c_wis_lengths, - b_g_n_c_wis_strides); std::array a_g_n_k_wos_strides_transposed = conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(a_g_n_k_wos_lengths, a_g_n_k_wos_strides); - + std::array b_g_n_c_wis_strides_transposed = + conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(b_g_n_c_wis_lengths, + b_g_n_c_wis_strides); + std::array e_g_k_c_xs_strides_transposed = + conv_ngchw_to_nhwgc_transformer.TransposeWeiStrides(e_g_k_c_xs_lengths, + e_g_k_c_xs_strides); const auto descs = conv_to_gemm_transformer .template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( @@ -469,7 +497,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle filter_spatial_lengths_, output_spatial_lengths_, b_g_n_c_wis_strides_transposed, - e_g_k_c_xs_strides, + e_g_k_c_xs_strides_transposed, a_g_n_k_wos_strides_transposed, conv_filter_strides, conv_filter_dilations, @@ -487,12 +515,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle // A/B/C Batch Stride compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides_transposed[0]; compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_n_c_wis_strides_transposed[0]; - compute_ptr_offset_of_batch_.BatchStrideC_ = - Conv_K_ * Conv_C_ * - std::accumulate(begin(filter_spatial_lengths_), - end(filter_spatial_lengths_), - index_t{1}, - std::multiplies<>{}); + compute_ptr_offset_of_batch_.BatchStrideC_ = 
e_g_k_c_xs_strides_transposed[0]; if(GridwiseGemm::CheckValidity(a_grid_desc_kbatch_k0_m_k1_, b_grid_desc_kbatch_k0_n_k1_, @@ -503,8 +526,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n_); } - if constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) { a_in_transpose_desc_ = conv_ngchw_to_nhwgc_transformer.template MakeNGCHWTransposeDesc( @@ -520,31 +543,33 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle conv_ngchw_to_nhwgc_transformer.template MakeNHWGCTransposeDesc( b_g_n_c_wis_lengths, b_g_n_c_wis_strides); - elementwise_block_2_ctile_map_transpose_a_ = Block2TileMapElementwise{ + e_in_transpose_desc_ = + conv_ngchw_to_nhwgc_transformer.template MakeGKYXCTransposeDesc( + e_g_k_c_xs_lengths, e_g_k_c_xs_strides); + e_out_transpose_desc_ = + conv_ngchw_to_nhwgc_transformer.template MakeGKCYXTransposeDesc( + e_g_k_c_xs_lengths, e_g_k_c_xs_strides); + + elementwise_block_2_ctile_map_transpose_a_ = Block2TileMapTranspose{ a_in_transpose_desc_.GetLength(I0), a_in_transpose_desc_.GetLength(I1)}; - elementwise_block_2_ctile_map_transpose_b_ = Block2TileMapElementwise{ + elementwise_block_2_ctile_map_transpose_b_ = Block2TileMapTranspose{ b_in_transpose_desc_.GetLength(I0), b_in_transpose_desc_.GetLength(I1)}; + + elementwise_block_2_ctile_map_transpose_e_ = Block2TileMapTranspose{ + e_in_transpose_desc_.GetLength(I0), e_in_transpose_desc_.GetLength(I1)}; } } std::size_t GetWorkspaceATensorSizeBytes() const { - return sizeof(ADataType) * a_in_transpose_desc_.GetElementSpaceSize(); - } - - std::size_t GetWorkspaceBTensorSizeBytes() const - { - return sizeof(BDataType) * b_in_transpose_desc_.GetElementSpaceSize(); - } - - std::size_t GetWorkspaceSizeBytes() const - { - // Transpose require workspace for A and B - if constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) + if constexpr(is_NGCHW_NGKHW() || + 
is_NGCDHW_NGKDHW()) { - return GetWorkspaceATensorSizeBytes() + GetWorkspaceBTensorSizeBytes(); + // Align to 128B + return math::integer_divide_ceil( + sizeof(ADataType) * a_in_transpose_desc_.GetElementSpaceSize(), 128) * + 128; } else { @@ -552,6 +577,41 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle } } + std::size_t GetWorkspaceBTensorSizeBytes() const + { + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) + { + // Align to 128B + return math::integer_divide_ceil( + sizeof(BDataType) * b_in_transpose_desc_.GetElementSpaceSize(), 128) * + 128; + } + else + { + return 0; + } + } + + std::size_t GetWorkspaceETensorSizeBytes() const + { + if constexpr(is_NGCHW_GKCYX_NGKHW() || + is_NGCDHW_GKCZYX_NGKDHW()) + { + return sizeof(CDataType) * e_in_transpose_desc_.GetElementSpaceSize(); + } + else + { + return 0; + } + } + + std::size_t GetWorkspaceSizeBytes() const + { + return GetWorkspaceATensorSizeBytes() + GetWorkspaceBTensorSizeBytes() + + GetWorkspaceETensorSizeBytes(); + } + const ADataType* p_a_grid_; const BDataType* p_b_grid_; CDataType* p_c_grid_; @@ -562,12 +622,15 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle Block2CTileMap block_2_ctile_map_; - Block2TileMapElementwise elementwise_block_2_ctile_map_transpose_a_, - elementwise_block_2_ctile_map_transpose_b_; + Block2TileMapTranspose elementwise_block_2_ctile_map_transpose_a_, + elementwise_block_2_ctile_map_transpose_b_, elementwise_block_2_ctile_map_transpose_e_; NGCHWTransposeDescType a_in_transpose_desc_, b_in_transpose_desc_; NHWGCTransposeDescType a_out_transpose_desc_, b_out_transpose_desc_; + GKYXCTransposeDescType e_in_transpose_desc_; + GKCYXTransposeDescType e_out_transpose_desc_; + // for computing batch offset ComputePtrOffsetOfStridedBatch<> compute_ptr_offset_of_batch_; @@ -621,9 +684,19 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle const ADataType* p_a_grid = arg.p_a_grid_; const BDataType* p_b_grid = arg.p_b_grid_; + CDataType* p_e_grid = arg.p_c_grid_; - if 
constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) + if constexpr(is_NGCHW_GKCYX_NGKHW() || + is_NGCDHW_GKCZYX_NGKDHW()) + { + p_e_grid = + type_convert(arg.p_workspace_) + + (arg.GetWorkspaceATensorSizeBytes() + arg.GetWorkspaceBTensorSizeBytes()) / + sizeof(CDataType); + } + + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) { const index_t grid_size_a = arg.elementwise_block_2_ctile_map_transpose_a_.CalculateGridSize( @@ -640,8 +713,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType); // Different data type for A and B is not supported - auto kernel_transpose = kernel_elementwise_dual, ck::Tuple, ck::Tuple, @@ -650,8 +723,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle ck::Tuple, ck::Tuple, ck::Tuple, - Block2TileMapElementwise, - Block2TileMapElementwise, + Block2TileMapTranspose, + Block2TileMapTranspose, element_wise::PassThrough>; avg_time += launch_and_time_kernel(stream_config, @@ -698,24 +771,36 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle ComputePtrOffsetOfStridedBatch<>, has_main_loop>; - avg_time += - launch_and_time_kernel(stream_config, - kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - p_a_grid, - p_b_grid, - arg.p_c_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.Conv_G_, - arg.a_grid_desc_kbatch_k0_m_k1_, - arg.b_grid_desc_kbatch_k0_n_k1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_, - arg.compute_ptr_offset_of_batch_); + const auto clear_workspace = [&]() { + if constexpr(is_NGCHW_GKCYX_NGKHW() || + is_NGCDHW_GKCZYX_NGKDHW()) + { + hip_check_error(hipMemsetAsync(p_e_grid, + 0, + arg.GetWorkspaceETensorSizeBytes(), + stream_config.stream_id_)); + } + }; + + avg_time += launch_and_time_kernel_with_preprocess( + stream_config, + clear_workspace, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_e_grid, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + 
arg.Conv_G_, + arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_, + arg.compute_ptr_offset_of_batch_); }; if(has_main_k0_block_loop) @@ -726,6 +811,38 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle { launch_kernel(integral_constant{}); } + + if constexpr(is_NGCHW_GKCYX_NGKHW() || + is_NGCDHW_GKCZYX_NGKDHW()) + { + const index_t grid_size_e = + arg.elementwise_block_2_ctile_map_transpose_e_.CalculateGridSize( + arg.e_in_transpose_desc_); + + const CDataType* p_e_in_grid = static_cast(p_e_grid); + + // Different data type for A and B is not supported + auto kernel_transpose = kernel_elementwise, + ck::Tuple, + ck::Tuple, + ck::Tuple, + Block2TileMapTranspose, + element_wise::PassThrough>; + + avg_time += launch_and_time_kernel(stream_config, + kernel_transpose, + dim3(grid_size_e), + dim3(BlockSize), + 0, + make_tuple(arg.e_in_transpose_desc_), + make_tuple(arg.e_out_transpose_desc_), + make_tuple(p_e_in_grid), + make_tuple(arg.p_c_grid_), + arg.elementwise_block_2_ctile_map_transpose_e_, + element_wise::PassThrough{}); + } + return avg_time; } @@ -763,7 +880,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle { if constexpr(!(is_NHWGC_GKYXC_NHWGK() || is_GNHWC_GKYXC_GNHWK() || - is_NGCHW_GKYXC_NGKHW())) + is_NGCHW_NGKHW())) { return false; } @@ -772,7 +889,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle { if constexpr(!(is_NDHWGC_GKZYXC_NDHWGK() || is_GNDHWC_GKZYXC_GNDHWK() || - is_NGCDHW_GKZYXC_NGKDHW())) + is_NGCDHW_NGKDHW())) { return false; } @@ -810,8 +927,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle return false; } - if constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) { if((arg.Conv_G_ * arg.Conv_C_) % TransposeTransferDstScalarPerVectorAligned != 0) { @@ -980,8 +1097,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle << CShuffleNXdlPerWavePerShuffle << ", " << 
CBlockTransferScalarPerVector_NWaveNPerXdl; - if constexpr(is_NGCHW_GKYXC_NGKHW() || - is_NGCDHW_GKZYXC_NGKDHW()) { + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) { str << ", TransposeTransferSrcScalarPerVectorAligned: " << TransposeTransferSrcScalarPerVectorAligned <<", " << "TransposeTransferDstScalarPerVectorAligned: " << TransposeTransferDstScalarPerVectorAligned; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp index 69913163f0..272b832e11 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp @@ -502,6 +502,11 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle static constexpr index_t ElementwiseBlocksize = ClusterLengthNPerBlock * ClusterLengthNPerBlock; + // NPerBlock is used for the first and second dim which to use + // CDEBlockTransferScalarPerVector_NPerBlock for load and store during + // transposition. CBlockTransferScalarPerVector_NWaveNPerXdl is aligned to + // NPerBlock so it is more flexible to use this dim for load store dimension + // with such scalar per vector. using GridwiseElementwiseInputTranspose = GridwiseElementwise, Tuple, diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_ngchw_to_nhwgc.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_ngchw_to_nhwgc.hpp index 7bf52cb229..0f28fe8169 100644 --- a/include/ck/tensor_operation/operator_transform/transform_conv_ngchw_to_nhwgc.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_conv_ngchw_to_nhwgc.hpp @@ -12,6 +12,15 @@ namespace ck { namespace tensor_operation { +/* + * Transform Convolution NGCHW to NHWGC. We transform [N, G, C, H, W] tensor + * descriptor to [N * G * C, H * W] (input or output image). 
The first + * dimension is store dimension, the second one is load dimension. For + * NHWGC to NGCHW load and store are reverted. For weight we transform + * [G, K, C, Y, X] to [G * K * Y * X, C]. First dim is load dimension, + * second dim is store dimension. + */ + template && is_same_v && + is_same_v) + { +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev1_instances( + op_ptrs); + add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev2_instances( + op_ptrs); + add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev5_instances( + op_ptrs); + add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f16_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && + is_same_v && + is_same_v && + is_same_v && + is_same_v) + { + add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev1_instances( + op_ptrs); + add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev2_instances( + op_ptrs); + add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev5_instances( + op_ptrs); + add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_bf16_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_FP32 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f32_instances( + op_ptrs); + } #endif } if constexpr(is_same_v && is_same_v && @@ -443,12 +488,6 @@ struct DeviceOperationInstanceFactory && is_same_v && + is_same_v) + { +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev1_instances( + op_ptrs); + 
add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev2_instances( + op_ptrs); + add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev5_instances( + op_ptrs); + add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && + is_same_v && + is_same_v && + is_same_v && + is_same_v) + { + add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev1_instances( + op_ptrs); + add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev2_instances( + op_ptrs); + add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev5_instances( + op_ptrs); + add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_FP32 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f32_instances( + op_ptrs); + } #endif } if constexpr(is_same_v && is_same_v && @@ -622,12 +700,6 @@ struct DeviceOperationInstanceFactory>>& instances); -void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instances( +void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_bf16_instances( std::vector>>& instances); -void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instances( +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev1_instances( std::vector>>& instances); -void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instances( +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev2_instances( std::vector>>& instances); + +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev5_instances( + std::vector>>& instances); -void 
add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instances( +void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f16_instances( std::vector>>& instances); -void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instances( +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev1_instances( std::vector>>& instances); -void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instances( +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev2_instances( std::vector>>& instances); + +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev5_instances( + std::vector>>& instances); +void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f32_instances( + std::vector>>& instances); + void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instances( std::vector>>& instances); -void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instances( +void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances( std::vector>>& instances); -void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instances( +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev1_instances( std::vector>>& instances); -void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instances( +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev2_instances( std::vector>>& instances); + +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev5_instances( + std::vector>>& instances); -void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instances( +void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances( std::vector>>& instances); -void 
add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instances( +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev1_instances( std::vector>>& instances); -void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instances( +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev2_instances( std::vector>>& instances); + +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev5_instances( + std::vector>>& instances); +void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f32_instances( + std::vector>>& instances); + void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances( std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_generic_instances< + 2, + NGCHW, + GKCYX, + NGKHW, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v1>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev2_instance.cpp similarity index 88% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev2_instance.cpp index 9fbdc6c461..0f0817b775 100644 --- 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev2_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" @@ -10,10 +10,10 @@ namespace device { namespace instance { // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] -void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev2_instances( +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_bf16_pipev2_instances( std::vector>>& instances) +{ + // 1. 
Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_generic_instances< + 2, + NGCHW, + GKCYX, + NGKHW, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v1>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev2_instance.cpp similarity index 88% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev2_instance.cpp index bbab53d9b5..7efe6f7bc1 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev2_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" @@ -10,10 +10,10 @@ namespace device { namespace instance { // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] -void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instances( +void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_f16_pipev2_instances( std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_generic_instances<2, + NGCHW, + GKYXC, + NGKHW, + ConvBwdWeightDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp similarity index 95% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp index 74ccc4c89b..6e77488299 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp similarity index 95% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp index fab2898559..4a0e89f0fe 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_irregular_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp similarity index 95% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp index 407645e893..9a0da7c431 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_irregular_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp similarity index 95% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp rename to 
library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp index 807de66ca5..e2ecee734f 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp similarity index 95% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp index 084c83cd65..a65c20c840 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_irregular_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp similarity index 95% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp rename to 
library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp index d174e5b6c0..089953dad2 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_irregular_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp similarity index 95% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp rename to 
library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp index cac9353354..678e5d234f 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp similarity index 96% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp index ee71e37e79..54edc0d247 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp @@ -1,5 +1,5 @@ 
// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp similarity index 97% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp rename to 
library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp index f6e1ada352..f77d88e71c 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp 
similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp similarity index 97% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp index 384706414a..e6115f28a1 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt index 860e08cafe..1b0d2dd0b2 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt +++ 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt @@ -1,43 +1,49 @@ # XDL_DL_WMMA_KERNELS set(GROUPED_CONV3D_BWD_WEIGHT - xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp - 
xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instance.cpp - xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instance.cpp + xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp + 
xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp + xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp + + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp + 
xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instance.cpp + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instance.cpp + + xdl/ngcdhw_gkzyxc_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp + xdl/ngcdhw_gkzyxc_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp + xdl/ngcdhw_gkzyxc_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp + + xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp + xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp + xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp 
+ xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev2_instance.cpp + xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev5_instance.cpp + xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev2_instance.cpp + xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev5_instance.cpp + xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev1_instance.cpp + xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev1_instance.cpp ) if(DL_KERNELS) @@ -62,7 +68,7 @@ list(APPEND GROUPED_CONV3D_BWD_WEIGHT if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES) list(APPEND GROUPED_CONV3D_BWD_WEIGHT - xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp) + xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp) endif() add_instance_library(device_grouped_conv3d_bwd_weight_instance ${GROUPED_CONV3D_BWD_WEIGHT}) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/gndhwc_gkzyxc_gndhwk/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp similarity index 95% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp rename to 
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp index 63249a1c13..4c4589d128 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp similarity index 95% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp index 7841ddad99..b6d8c7f635 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp similarity index 95% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp rename to 
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp index ba6285a380..5b295e728b 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp similarity index 95% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp index a8fbefb5bd..125b324985 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp similarity index 95% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp index e4baafc0be..beb937f185 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp similarity index 95% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp index f9bc5b1349..5274ff74a0 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp rename to 
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp similarity index 95% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp index 679f30a3d9..767e091b94 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp similarity index 96% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp index f1ea371819..53011b4972 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp similarity index 97% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp index 6e7f22b7e5..8a1e0b2008 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp similarity index 97% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp index eba721c7b8..d23b8516ca 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp similarity index 97% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp rename to 
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp index 7dd289139c..4de221a885 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ndhwgc_gkzyxc_ndhwgk/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev1_instance.cpp new file mode 100644 index 0000000000..e7cfcf1e5f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev1_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev1_instances( + std::vector>>& instances) +{ + // 1. 
Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_generic_instances< + 3, + NGCDHW, + GKCZYX, + NGKDHW, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v1>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev2_instance.cpp similarity index 89% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev2_instance.cpp index ac6cb82681..8d9c3c56ed 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev2_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" @@ -10,10 +10,10 @@ namespace device { namespace instance { // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] -void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instances( +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_bf16_pipev2_instances( std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_generic_instances< + 3, + NGCDHW, + GKCZYX, + NGKDHW, + ConvBwdWeightDefault, + BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion::v1>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev2_instance.cpp similarity index 89% rename from library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp rename to library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev2_instance.cpp index 489fa81a7f..c8c6253362 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/ngcdhw_gkczyx_ngkdhw/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev2_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" @@ -10,10 +10,10 @@ namespace device { namespace instance { // Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] -void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instances( +void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkczyx_ngkdhw_f16_pipev2_instances( std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_generic_instances<3, + NGCDHW, + GKZYXC, + NGKDHW, + ConvBwdWeightDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/src/profile_grouped_conv_bwd_weight.cpp b/profiler/src/profile_grouped_conv_bwd_weight.cpp index 4170ac65aa..1640b48ffd 100644 --- a/profiler/src/profile_grouped_conv_bwd_weight.cpp +++ b/profiler/src/profile_grouped_conv_bwd_weight.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include #include @@ -17,6 +17,7 @@ enum struct ConvLayout GNHWC_GKYXC_GNHWK, // 1 NHWGC_GKYXC_NHWGK, // 2 NGCHW_GKYXC_NGKHW, // 3 + NGCHW_GKCYX_NGKHW, // 4 }; enum struct ConvDataType @@ -49,6 +50,8 @@ static void print_helper_msg() "Ho, Wo, G, K]\n" << " 3: Input[N, G, C, Hi, Wi], Weight[G, K, Y, X, C], Output[N, " "G, K, Ho, Wo]\n" + << " 4: Input[N, G, C, Hi, Wi], Weight[G, K, C, Y, X], Output[N, " + "G, K, Ho, Wo]\n" << "arg4: verification (0: no, 1: yes)\n" << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" << "arg6: print tensor value (0: no; 1: yes)\n" @@ -199,6 +202,21 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[]) return profile(I2, NGCHW{}, GKYXC{}, NGKHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); } } + else if(num_dim_spatial == 2 && layout == ConvLayout::NGCHW_GKCYX_NGKHW) + { + if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I2, NGCHW{}, GKCYX{}, NGKHW{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I2, NGCHW{}, GKCYX{}, NGKHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I2, NGCHW{}, GKCYX{}, NGKHW{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + } if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK) { if(data_type == ConvDataType::F32_F32_F32) @@ -262,6 +280,22 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[]) I3, NGCDHW{}, GKZYXC{}, NGKDHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); } } + else if(num_dim_spatial == 3 && layout == ConvLayout::NGCHW_GKCYX_NGKHW) + { + if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile( + I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I3, NGCDHW{}, GKCZYX{}, 
NGKDHW{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + } std::cout << "this data_type & layout is not implemented" << std::endl; diff --git a/script/convert_miopen_driver_to_profiler.py b/script/convert_miopen_driver_to_profiler.py index 81f9977542..1278b6744d 100644 --- a/script/convert_miopen_driver_to_profiler.py +++ b/script/convert_miopen_driver_to_profiler.py @@ -29,8 +29,9 @@ def run_ck_profiler_cmd(cmd): def parse_layouts(args): if args.in_layout == "NCW" or args.in_layout == "NCHW" or \ args.in_layout == "NCDHW": - if args.ck_profier_op == "grouped_conv_bwd_weight" or \ - args.ck_profier_op == "grouped_conv_fwd" or \ + if args.ck_profier_op == "grouped_conv_bwd_weight": + args.layout = 4 + elif args.ck_profier_op == "grouped_conv_fwd" or \ args.ck_profier_op == "grouped_conv_bwd_data": args.layout = 3 else: diff --git a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp index 54b96d775c..21f2cb5ce6 100644 --- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp +++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp @@ -146,8 +146,12 @@ using KernelTypes2d = ::testing::Types< std::tuple>, std::tuple>, std::tuple>, + std::tuple>, std::tuple>, - std::tuple>>; + std::tuple>, + std::tuple>, + std::tuple>, + std::tuple>>; using KernelTypes3d = ::testing::Types< std::tuple>, std::tuple>, @@ -158,8 +162,12 @@ using KernelTypes3d = ::testing::Types< std::tuple>, std::tuple>, std::tuple>, + std::tuple>, std::tuple>, - std::tuple>>; + std::tuple>, + std::tuple>, + std::tuple>, + std::tuple>>; TYPED_TEST_SUITE(TestGroupedConvndBwdWeight1d, KernelTypes1d); TYPED_TEST_SUITE(TestGroupedConvndBwdWeight2d, KernelTypes2d); From 9329432f6c3d4ddd8d5b836245bd44acef89be3d Mon Sep 17 00:00:00 2001 From: aledudek Date: Thu, 3 Apr 2025 13:35:43 +0200 Subject: [PATCH 016/443] Post-merge changes for fully async args copy in ck grouped gemm (#1991) * Post-merge changes for 
fully async args copy in ck grouped gemm * Post-merge documentation and naming changes * Build fix and updated changelog * Revised comments --- CHANGELOG.md | 2 ++ .../run_grouped_gemm_example.inc | 35 +++++++++++++------ .../device_grouped_gemm_multiple_d_dl.hpp | 15 ++++++-- .../device/impl/device_grouped_gemm_xdl.hpp | 16 +++++++-- ...evice_grouped_gemm_xdl_splitk_cshuffle.hpp | 16 +++++++-- 5 files changed, 68 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f9da2b3117..49ef2998eb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj ### Added * Added support for bf16, f32, and f16 for 2D and 3D NGCHW grouped convolution backward data +* Added a fully asynchronous HOST (CPU) arguments copy flow for CK grouped GEMM kernels. +* Added support for GKCYX layout for grouped convolution forward (NGCHW/GKCYX/NGKHW, number of instances in instance factory for NGCHW/GKYXC/NGKHW has been reduced). * Added support for GKCYX layout for grouped convolution forward (NGCHW/GKCYX/NGKHW). * Added support for GKCYX layout for grouped convolution backward weight (NGCHW/GKCYX/NGKHW). * Added support for GKCYX layout for grouped convolution backward data (NGCHW/GKCYX/NGKHW). 
diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/example/15_grouped_gemm/run_grouped_gemm_example.inc index 86b3182a52..7186c22233 100644 --- a/example/15_grouped_gemm/run_grouped_gemm_example.inc +++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc @@ -21,6 +21,7 @@ struct ExecutionConfig final bool do_verification = true; int init_method = 1; bool time_kernel = false; + bool async_hargs = false; }; bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) @@ -190,10 +191,10 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co gemm_workspace.Realloc(workspace_size); gemm.SetWorkSpacePointer(&argument, gemm_workspace.GetDeviceBuffer()); } - if(hargs_size > 0) + if(config.async_hargs && hargs_size > 0) { hip_check_error(hipHostMalloc(&gemm_hargs, hargs_size)); - gemm.SetHostKernelArgs(&argument, gemm_hargs); + gemm.SetHostKernelArgsPointer(&argument, gemm_hargs); } if(!gemm.IsSupportedArgument(argument)) @@ -203,16 +204,23 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co "not support this GEMM problem"); } - hipStream_t stream0 = nullptr; - hip_check_error(hipStreamCreate(&stream0)); + if(!config.async_hargs) + { + invoker.Run(argument, StreamConfig{nullptr, false}); + } + else + { + hipStream_t stream0 = nullptr; + hip_check_error(hipStreamCreate(&stream0)); - hipEvent_t event0 = nullptr; - hip_check_error(hipEventCreate(&event0)); + hipEvent_t event0 = nullptr; + hip_check_error(hipEventCreate(&event0)); - invoker.Run(argument, StreamConfig{nullptr, false}, stream0, event0); + invoker.Run(argument, StreamConfig{nullptr, false}, stream0, event0); - hip_check_error(hipEventSynchronize(event0)); - hip_check_error(hipStreamSynchronize(stream0)); + hip_check_error(hipEventSynchronize(event0)); + hip_check_error(hipStreamSynchronize(stream0)); + } bool pass = true; if(config.do_verification) @@ -280,18 +288,25 @@ bool run_grouped_gemm_example(int argc, 
char* argv[]) problem_size.stride_Bs.push_back(problem_size.Ks[i]); problem_size.stride_Cs.push_back(problem_size.Ns[i]); } - if(argc == 4) { config.do_verification = std::stoi(argv[1]); config.init_method = std::stoi(argv[2]); config.time_kernel = std::stoi(argv[3]); } + else if(argc == 5) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + config.async_hargs = std::stoi(argv[4]); + } else { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); printf("arg3: time kernel (0=n0, 1=yes)\n"); + printf("arg4: async hargs (0=n0, 1=yes)\n"); exit(0); } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp index c148d7dbb7..463b10de43 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp @@ -607,6 +607,9 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemmSetWorkSpacePointer(p_arg, p_dev_kernel_args); } - void SetHostKernelArgs(BaseArgument* p_arg, void* p_host_kernel_args) const + //---------------------------------------------------------------------------------------------- + /// @brief Sets the host kernel arguments pointer and copies that data on the host side. + /// This function can be utilised to use pinned memory for the host args and + /// achieve fully async data copy. + /// + /// @param p_arg The pointer to the Argument we're going to update. 
+ /// @param[in] p_host_kernel_args The pointer to the host memory where the kernel + /// arguments will be copied + void SetHostKernelArgsPointer(BaseArgument* p_arg, void* p_host_kernel_args) const { Argument* pArg_ = dynamic_cast(p_arg); if(!pArg_) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp index 2a6406aac3..d9a0249da8 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp @@ -560,6 +560,9 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm(p_arg); if(!pArg_) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp index 03431d7156..a2afb62eec 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp @@ -423,6 +423,9 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitKSetWorkSpacePointer(p_arg, p_dev_kernel_args); } - void SetHostKernelArgs(BaseArgument* p_arg, void* p_host_kernel_args) const + //---------------------------------------------------------------------------------------------- + /// @brief Sets the host kernel arguments pointer and copies that data on the host side. + /// This function can be utilised to use pinned memory for the host args and + /// achieve fully async data copy. + /// + /// @param p_arg The pointer to the Argument we're going to update. 
+ /// @param[in] p_host_kernel_args The pointer to the host memory where the kernel + /// arguments will be copied + /// + void SetHostKernelArgsPointer(BaseArgument* p_arg, void* p_host_kernel_args) const { Argument* pArg_ = dynamic_cast(p_arg); if(!pArg_) From 265af71a71fd81c99988365477973c337c512e13 Mon Sep 17 00:00:00 2001 From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com> Date: Thu, 3 Apr 2025 12:42:03 -0500 Subject: [PATCH 017/443] Add FP16/BF16<->FP8/BF8 conversions (#2035) * Move conversion functions and add missing conversions * Add tests * Add missing conversions * Add missing conversions * Add bf8 tests * Update clipping for vectors * Add missing conversions * Add bf16 fp8 tests * Add bf16 bf8 tests * Fix device conversion * Fix conversions * Fix vector use * Minor fix * Add a workaround flag * Add a workaround flag for bf16 conversion * Add another workaround * Add a workaround for fp16 to bf8 conversion * Update type alias * Add docstrings and missing wrappers * Fix if defined macros * Fix more if defined macros * Add comments * Remove __host__ specifier * Add a gfx950 guard * Update function naming --- include/ck/ck.hpp | 6 + include/ck/utility/amd_ck_fp8.hpp | 864 +++++++++++++++++++-- include/ck/utility/mxf8_utils.hpp | 2 +- include/ck/utility/scaled_type_convert.hpp | 4 +- include/ck/utility/type_convert.hpp | 696 ++++++++++++++++- test/data_type/test_bf8_ocp.cpp | 595 +++++++++++++- test/data_type/test_fp8_ocp.cpp | 571 +++++++++++++- 7 files changed, 2628 insertions(+), 110 deletions(-) diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 5fa73d2fda..1d49b68a32 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -248,6 +248,12 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING) // workaround: compiler issue on gfx950 #define CK_WORKAROUND_FP32_TO_FP4_SR_CONVERSION 1 +// workaround: compiler issue on gfx950 +#define CK_WORKAROUND_FP16_TO_FP8_CONVERSION 1 + +// workaround: compiler issue on gfx950 +#define 
CK_WORKAROUND_BF16_TO_FP8_CONVERSION 1 + // denorm test fix, necessary for gfx90a #ifndef CK_GFX90A_DENORM_WORKAROUND #define CK_GFX90A_DENORM_WORKAROUND 0 diff --git a/include/ck/utility/amd_ck_fp8.hpp b/include/ck/utility/amd_ck_fp8.hpp index 5c80c42d6c..b0089bb2d1 100644 --- a/include/ck/utility/amd_ck_fp8.hpp +++ b/include/ck/utility/amd_ck_fp8.hpp @@ -64,6 +64,9 @@ enum class ck_saturation_t namespace fp8_impl { typedef fp8_storage_t fp8x2_storage_t __attribute__((ext_vector_type(2))); +typedef _Float16 half2_t __attribute__((ext_vector_type(2))); +typedef ushort ushortx2_t __attribute__((ext_vector_type(2))); +typedef short shortx2_t __attribute__((ext_vector_type(2))); typedef float float2_t __attribute__((ext_vector_type(2))); __host__ __device__ static inline constexpr bool fnuz_f8_is_nan(f8_fnuz_t a) @@ -270,7 +273,7 @@ static __host__ __device__ float cast_to_f32_from_f8(fp8_storage_t v) } template -static __device__ float2_t cast_to_f32x2_from_f8x2(fp8x2_storage_t v) +static __device__ float2_t cast_to_f32_from_f8(fp8x2_storage_t v) { const auto i16val = bit_cast(v); @@ -458,6 +461,510 @@ __is_interpret_supported([[maybe_unused]] ck_fp8_interpretation_t interp) #endif } +#if defined(__gfx950__) +template = false, + ck::enable_if_t = false> +static __device__ fp8_storage_t cast_to_f8_from_f16(_Float16 v, unsigned int rng = 0) +{ + union + { + unsigned int i32val; + half2_t half_vec; + fp8_storage_t i8val[4]; + } val; + + constexpr unsigned int i32val = 0; + val.half_vec[0] = v; + + if constexpr(saturate) + { + if((val.i32val & 0x7FFF) != 0x7FFF) + { + val.half_vec[0] = __builtin_amdgcn_fmed3h(val.half_vec[0], 448.0, -448.0); + } + } + + val.i32val = + __builtin_amdgcn_cvt_scalef32_sr_fp8_f16(i32val, val.half_vec[0], rng, /* scale */ 1.f, 0); + + return val.i8val[0]; +} + +template = false, + ck::enable_if_t = false> +static __device__ fp8x2_storage_t cast_to_f8_from_f16(half2_t v, unsigned int rng = 0) +{ + // there is no packed conversion with SR, so 
convert one element at a time + return fp8x2_storage_t{ + cast_to_f8_from_f16(v[0], rng), + cast_to_f8_from_f16(v[1], rng)}; +} + +template = false, + ck::enable_if_t = false> +static __device__ fp8_storage_t cast_to_f8_from_f16(_Float16 v, unsigned int rng = 0) +{ + union + { + unsigned int i32val; + half2_t half_vec; + fp8_storage_t i8val[4]; + } val; + + constexpr unsigned int i32val = 0; + val.half_vec[0] = v; + + if constexpr(saturate) + { + if((val.i32val & 0x7FFF) != 0x7FFF) + { + val.half_vec[0] = __builtin_amdgcn_fmed3h(val.half_vec[0], 57344.0, -57344.0); + } + } + + val.i32val = + __builtin_amdgcn_cvt_scalef32_sr_bf8_f16(i32val, val.half_vec[0], rng, /* scale */ 1.f, 0); + + return val.i8val[0]; +} + +template = false, + ck::enable_if_t = false> +static __device__ fp8x2_storage_t cast_to_f8_from_f16(half2_t v, unsigned int rng = 0) +{ + // there is no packed conversion with SR, so convert one element at a time + return fp8x2_storage_t{ + cast_to_f8_from_f16(v[0], rng), + cast_to_f8_from_f16(v[1], rng)}; +} + +template = false, + ck::enable_if_t = false> +static __device__ fp8_storage_t cast_to_f8_from_f16(_Float16 v, unsigned int rng = 0) +{ + std::ignore = rng; + + union + { + unsigned int i32val; + half2_t half_vec; + shortx2_t i16_vec; + fp8_storage_t i8val[4]; + } val; + + constexpr shortx2_t i16x2val = {0, 0}; + val.half_vec[0] = v; + + if constexpr(saturate) + { + if((val.i32val & 0x7FFF) != 0x7FFF) + { + val.half_vec[0] = __builtin_amdgcn_fmed3h(val.half_vec[0], 448.0, -448.0); + } + } + + val.i16_vec = + __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(i16x2val, val.half_vec, /* scale */ 1.f, 0); + + return val.i8val[0]; +} + +template = false, + ck::enable_if_t = false> +static __device__ fp8x2_storage_t cast_to_f8_from_f16(half2_t v, unsigned int rng = 0) +{ +#if CK_WORKAROUND_FP16_TO_FP8_CONVERSION + return fp8x2_storage_t{ + cast_to_f8_from_f16(v[0], rng), + cast_to_f8_from_f16(v[1], rng)}; +#else + std::ignore = rng; + + union + { + half2_t 
half_vec; + shortx2_t i16_vec; + fp8_storage_t i8val[4]; + } val; + + constexpr shortx2_t i16x2val = {0, 0}; + val.half_vec = v; + + if constexpr(saturate) + { + if((val.i16_vec[0] & 0x7FFF) != 0x7FFF) + { + val.half_vec[0] = __builtin_amdgcn_fmed3h(val.half_vec[0], 448.0, -448.0); + } + if((val.i16_vec[1] & 0x7FFF) != 0x7FFF) + { + val.half_vec[1] = __builtin_amdgcn_fmed3h(val.half_vec[1], 448.0, -448.0); + } + } + + val.i16_vec = + __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(i16x2val, val.half_vec, /* scale */ 1.f, 0); + + return fp8x2_storage_t{val.i8val[0], val.i8val[1]}; +#endif +} + +template = false, + ck::enable_if_t = false> +static __device__ fp8_storage_t cast_to_f8_from_f16(_Float16 v, unsigned int rng = 0) +{ + std::ignore = rng; + + union + { + unsigned int i32val; + half2_t half_vec; + shortx2_t i16_vec; + fp8_storage_t i8val[4]; + } val; + + constexpr shortx2_t i16x2val = {0, 0}; + val.half_vec[0] = v; + + if constexpr(saturate) + { + if((val.i32val & 0x7FFF) != 0x7FFF) + { + val.half_vec[0] = __builtin_amdgcn_fmed3h(val.half_vec[0], 57344.0, -57344.0); + } + } + + val.half_vec = + __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(i16x2val, val.half_vec, /* scale */ 1.f, 0); + + return val.i8val[0]; +} + +template = false, + ck::enable_if_t = false> +static __device__ fp8x2_storage_t cast_to_f8_from_f16(half2_t v, unsigned int rng = 0) +{ +#if CK_WORKAROUND_FP16_TO_FP8_CONVERSION + return fp8x2_storage_t{ + cast_to_f8_from_f16(v[0], rng), + cast_to_f8_from_f16(v[1], rng)}; +#else + std::ignore = rng; + + union + { + half2_t half_vec; + shortx2_t i16_vec; + fp8_storage_t i8val[4]; + } val; + + constexpr shortx2_t i16x2val = {0, 0}; + val.half_vec = v; + + if constexpr(saturate) + { + if((val.i16_vec[0] & 0x7FFF) != 0x7FFF) + { + val.half_vec[0] = __builtin_amdgcn_fmed3h(val.half_vec[0], 57344.0, -57344.0); + } + if((val.i16_vec[1] & 0x7FFF) != 0x7FFF) + { + val.half_vec[1] = __builtin_amdgcn_fmed3h(val.half_vec[1], 57344.0, -57344.0); + } + } + + val.i16_vec = + 
__builtin_amdgcn_cvt_scalef32_pk_bf8_f16(i16x2val, val.half_vec, /* scale */ 1.f, 0); + + return fp8x2_storage_t{val.i8val[0], val.i8val[1]}; +#endif +} + +template = false, + ck::enable_if_t = false> +static __device__ fp8_storage_t cast_to_f8_from_bf16(ushort v, unsigned int rng = 0) +{ + union + { + unsigned int i32val; + ushortx2_t bhalf_vec; + fp8_storage_t i8val[4]; + } val; + + constexpr unsigned int i32val = 0; + val.bhalf_vec[0] = v; + + if constexpr(saturate) + { + if((val.i32val & 0x7FFF) != 0x7FFF) + { + val.bhalf_vec[0] = + ushort((bit_cast(__builtin_amdgcn_fmed3f( + bit_cast(uint32_t{val.bhalf_vec[0]} << 16), 448.0, -448.0)) >> + 16)); // convert to float and back + } + } + + val.i32val = __builtin_amdgcn_cvt_scalef32_sr_fp8_bf16( + i32val, val.bhalf_vec[0], rng, /* scale */ 1.f, 0); + + return val.i8val[0]; +} + +template = false, + ck::enable_if_t = false> +static __device__ fp8x2_storage_t cast_to_f8_from_bf16(ushortx2_t v, unsigned int rng = 0) +{ + // there is no packed conversion with SR, so convert one element at a time + return fp8x2_storage_t{ + cast_to_f8_from_bf16(v[0], rng), + cast_to_f8_from_bf16(v[1], rng)}; +} + +template = false, + ck::enable_if_t = false> +static __device__ fp8_storage_t cast_to_f8_from_bf16(ushort v, unsigned int rng = 0) +{ + union + { + unsigned int i32val; + ushortx2_t bhalf_vec; + fp8_storage_t i8val[4]; + } val; + + constexpr unsigned int i32val = 0; + val.bhalf_vec[0] = v; + + if constexpr(saturate) + { + if((val.i32val & 0x7FFF) != 0x7FFF) + { + val.bhalf_vec[0] = ushort( + (bit_cast(__builtin_amdgcn_fmed3f( + bit_cast(uint32_t{val.bhalf_vec[0]} << 16), 57344.0, -57344.0)) >> + 16)); // convert to float and back + } + } + + val.i32val = __builtin_amdgcn_cvt_scalef32_sr_bf8_bf16( + i32val, val.bhalf_vec[0], rng, /* scale */ 1.f, 0); + + return val.i8val[0]; +} + +template = false, + ck::enable_if_t = false> +static __device__ fp8x2_storage_t cast_to_f8_from_bf16(ushortx2_t v, unsigned int rng = 0) +{ + // there 
is no packed conversion with SR, so convert one element at a time + return fp8x2_storage_t{ + cast_to_f8_from_bf16(v[0], rng), + cast_to_f8_from_bf16(v[1], rng)}; +} + +template = false, + ck::enable_if_t = false> +static __device__ fp8_storage_t cast_to_f8_from_bf16(ushort v, unsigned int rng = 0) +{ + std::ignore = rng; + + union + { + unsigned int i32val; + ushortx2_t bhalf_vec; + shortx2_t i16_vec; + fp8_storage_t i8val[4]; + } val; + + constexpr shortx2_t i16x2val = {0, 0}; + val.bhalf_vec[0] = v; + + if constexpr(saturate) + { + if((val.i32val & 0x7FFF) != 0x7FFF) + { + val.bhalf_vec[0] = + ushort((bit_cast(__builtin_amdgcn_fmed3f( + bit_cast(uint32_t{val.bhalf_vec[0]} << 16), 448.0, -448.0)) >> + 16)); // convert to float and back + } + } + + val.i16_vec = + __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(i16x2val, val.bhalf_vec, /* scale */ 1.f, 0); + + return val.i8val[0]; +} + +template = false, + ck::enable_if_t = false> +static __device__ fp8x2_storage_t cast_to_f8_from_bf16(ushortx2_t v, unsigned int rng = 0) +{ +#if CK_WORKAROUND_BF16_TO_FP8_CONVERSION + return fp8x2_storage_t{ + cast_to_f8_from_bf16(v[0], rng), + cast_to_f8_from_bf16(v[1], rng)}; +#else + std::ignore = rng; + + union + { + ushortx2_t bhalf_vec; + shortx2_t i16_vec; + fp8_storage_t i8val[4]; + } val; + + constexpr shortx2_t i16x2val = {0, 0}; + val.bhalf_vec = v; + + if constexpr(saturate) + { + if((val.i16_vec[0] & 0x7FFF) != 0x7FFF) + { + val.bhalf_vec[0] = + ushort((bit_cast(__builtin_amdgcn_fmed3f( + bit_cast(uint32_t{val.bhalf_vec[0]} << 16), 448.0, -448.0)) >> + 16)); // convert to float and back + } + if((val.i16_vec[1] & 0x7FFF) != 0x7FFF) + { + val.bhalf_vec[1] = + ushort((bit_cast(__builtin_amdgcn_fmed3f( + bit_cast(uint32_t{val.bhalf_vec[1]} << 16), 448.0, -448.0)) >> + 16)); // convert to float and back + } + } + + val.i16_vec = + __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(i16x2val, val.bhalf_vec, /* scale */ 1.f, 0); + + return fp8x2_storage_t{val.i8val[0], val.i8val[1]}; 
+#endif +} + +template = false, + ck::enable_if_t = false> +static __device__ fp8_storage_t cast_to_f8_from_bf16(ushort v, unsigned int rng = 0) +{ + std::ignore = rng; + + union + { + unsigned int i32val; + ushortx2_t bhalf_vec; + shortx2_t i16_vec; + fp8_storage_t i8val[4]; + } val; + + constexpr shortx2_t i16x2val = {0, 0}; + val.bhalf_vec[0] = v; + + if constexpr(saturate) + { + if((val.i32val & 0x7FFF) != 0x7FFF) + { + val.bhalf_vec[0] = ushort( + (bit_cast(__builtin_amdgcn_fmed3f( + bit_cast(uint32_t{val.bhalf_vec[0]} << 16), 57344.0, -57344.0)) >> + 16)); // convert to float and back + } + } + + val.i16_vec = + __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(i16x2val, val.bhalf_vec, /* scale */ 1.f, 0); + + return val.i8val[0]; +} + +template = false, + ck::enable_if_t = false> +static __device__ fp8x2_storage_t cast_to_f8_from_bf16(ushortx2_t v, unsigned int rng = 0) +{ + std::ignore = rng; + + union + { + ushortx2_t bhalf_vec; + shortx2_t i16_vec; + fp8_storage_t i8val[4]; + } val; + + constexpr shortx2_t i16x2val = {0, 0}; + val.bhalf_vec = v; + + if constexpr(saturate) + { + if((val.i16_vec[0] & 0x7FFF) != 0x7FFF) + { + val.bhalf_vec[0] = ushort( + (bit_cast(__builtin_amdgcn_fmed3f( + bit_cast(uint32_t{val.bhalf_vec[0]} << 16), 57344.0, -57344.0)) >> + 16)); // convert to float and back + } + if((val.i16_vec[1] & 0x7FFF) != 0x7FFF) + { + val.bhalf_vec[1] = ushort( + (bit_cast(__builtin_amdgcn_fmed3f( + bit_cast(uint32_t{val.bhalf_vec[1]} << 16), 57344.0, -57344.0)) >> + 16)); // convert to float and back + } + } + + val.i16_vec = + __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(i16x2val, val.bhalf_vec, /* scale */ 1.f, 0); + + return fp8x2_storage_t{val.i8val[0], val.i8val[1]}; +} +#endif // defined(__gfx950__) + #if CK_FP8_CVT_FAST_PATH // The conversion function is from rocblas // https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_float8.h#L79 @@ -523,6 +1030,84 @@ static __device__ fp8_storage_t 
cast_to_f8_from_f32(float v, unsigned int rng = } return i8data; } + +template +static __device__ fp8x2_storage_t cast_to_f8_from_f32(float2_t v, unsigned int rng = 0) +{ + if constexpr(stochastic_rounding) + { + // there is no packed conversion with SR, so convert one element at a time + return fp8x2_storage_t{ + cast_to_f8_from_f32(v[0], rng), + cast_to_f8_from_f32(v[1], rng)}; + } + else + { + union + { + float fval; + unsigned int i32val; + unsigned char i8val[4]; + } val0, val1; + + val0.fval = v[0]; + val1.fval = v[1]; + + unsigned int ival = 0; + + if constexpr(saturate) + { + if constexpr(interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) + { + if((val0.i32val & 0x7F800000) != 0x7F800000) + { /// propagate NAN/INF, no clipping + val0.fval = __builtin_amdgcn_fmed3f(val0.fval, 240.0, -240.0); + } + if((val1.i32val & 0x7F800000) != 0x7F800000) + { /// propagate NAN/INF, no clipping + val1.fval = __builtin_amdgcn_fmed3f(val1.fval, 240.0, -240.0); + } + } + else if constexpr(interpret == ck_fp8_interpretation_t::CK_E4M3_OCP) + { // OCP type + if((val0.i32val & 0x7F800000) != 0x7F800000) + { /// propagate NAN/INF, no clipping + val0.fval = __builtin_amdgcn_fmed3f(val0.fval, 448.0, -448.0); + } + if((val1.i32val & 0x7F800000) != 0x7F800000) + { /// propagate NAN/INF, no clipping + val1.fval = __builtin_amdgcn_fmed3f(val1.fval, 448.0, -448.0); + } + } + else + { + if((val0.i32val & 0x7F800000) != 0x7F800000) + { /// propagate NAN/INF, no clipping + val0.fval = __builtin_amdgcn_fmed3f(val0.fval, 57344.0, -57344.0); + } + if((val1.i32val & 0x7F800000) != 0x7F800000) + { /// propagate NAN/INF, no clipping + val1.fval = __builtin_amdgcn_fmed3f(val1.fval, 57344.0, -57344.0); + } + } + } + + // RNE CVT + if constexpr((interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) || + (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)) + { + ival = __builtin_amdgcn_cvt_pk_fp8_f32(val0.fval, val1.fval, ival, false); + } + else + { + ival = 
__builtin_amdgcn_cvt_pk_bf8_f32(val0.fval, val1.fval, ival, false); + } + + val0.i32val = ival; + + return fp8x2_storage_t{val0.i8val[0], val0.i8val[1]}; + } +} #endif // CK_FP8_CVT_FAST_PATH // The conversion function is from rocblas @@ -797,6 +1382,7 @@ __host__ __device__ static inline fp8_storage_t cast_to_f8(T _x, unsigned int rn * * \tparam interp interpretation of fp8 * \tparam sat saturation of fp8 + * \tparam stochastic_rounding switch between RNE and SR * \param f float number * \return fp8_storage_t */ @@ -882,6 +1468,47 @@ __host__ static inline fp8_storage_t cvt_float_to_fp8(const float f) #endif // CK_FP8_CVT_FAST_PATH } +/** + * \brief convert vector of 2 floats to vector of 2 @p fp8_storage_t + * + * \tparam interp interpretation of fp8 + * \tparam sat saturation of fp8 + * \tparam stochastic_rounding switch between RNE and SR + * \param f vector of 2 floats + * \return fp8x2_storage_t + */ +template +#if CK_FP8_CVT_FAST_PATH +__device__ static inline fp8x2_storage_t cvt_float_to_fp8(const float2_t f) +{ + __is_interpret_supported(interp); + uint32_t rng = 0; + if constexpr(stochastic_rounding) + { + constexpr int seed = 1254739; +#ifndef CK_CODE_GEN_RTC + rng = prand_generator(reinterpret_cast(&f), f[0]); +#else + rng = prand_generator(reinterpret_cast(&f), f[0]); +#endif + } + return cast_to_f8_from_f32( + f, rng); +#else +#if CK_USE_OCP_FP8 +__host__ __device__ static inline fp8x2_storage_t cvt_float_to_fp8(const float2_t f) +{ +#else +__host__ static inline fp8x2_storage_t cvt_float_to_fp8(const float2_t f) +{ +#endif // CK_USE_OCP_FP8 + return fp8x2_storage_t{cvt_float_to_fp8(f[0]), + cvt_float_to_fp8(f[1])}; +#endif // CK_FP8_CVT_FAST_PATH +} + /** * \brief convert _Float16 to @p fp8_storage_t * @@ -900,87 +1527,168 @@ __host__ __device__ static inline fp8_storage_t cvt_half_t_to_fp8(const _Float16 __host__ static inline fp8_storage_t cvt_half_t_to_fp8(const _Float16 x) #endif { - return cvt_float_to_fp8(static_cast(x)); + { + 
__is_interpret_supported(interp); + uint32_t rng = 0; + if constexpr(stochastic_rounding) + { + constexpr int seed = 1254739; +#ifndef CK_CODE_GEN_RTC + rng = prand_generator(reinterpret_cast(&x), x); +#else + rng = prand_generator(reinterpret_cast(&x), x); +#endif + } +#if defined(__gfx950__) + return cast_to_f8_from_f16(x, rng); +#else + std::ignore = rng; + return cvt_float_to_fp8( + static_cast(x)); +#endif // defined(__gfx950__) + } +} + +/** + * \brief convert vector of 2 _Float16 to vector of 2 @p fp8_storage_t + * + * \tparam sat saturation of fp8 + * \tparam interp interpretation of fp8 + * \tparam stochastic_rounding switch between RNE and SR + * \param x vector of 2 _Float16 + * \return fp8x2_storage_t + */ +template +#if CK_FP8_CVT_FAST_PATH || CK_USE_OCP_FP8 +__host__ __device__ static inline fp8x2_storage_t cvt_half_t_to_fp8(const half2_t x) +#else +__host__ static inline fp8x2_storage_t cvt_half_t_to_fp8(const half2_t x) +#endif +{ + { + __is_interpret_supported(interp); + uint32_t rng = 0; + if constexpr(stochastic_rounding) + { + constexpr int seed = 1254739; +#ifndef CK_CODE_GEN_RTC + rng = prand_generator(reinterpret_cast(&x), x[0]); +#else + rng = prand_generator(reinterpret_cast(&x), x[0]); +#endif + } +#if defined(__gfx950__) + return cast_to_f8_from_f16(x, rng); +#else + std::ignore = rng; + return cvt_float_to_fp8( + float2_t{static_cast(x[0]), static_cast(x[1])}); +#endif // defined(__gfx950__) + } +} + +/** + * \brief convert bhalf_t to @p fp8_storage_t + * + * \tparam sat saturation of fp8 + * \tparam interp interpretation of fp8 + * \tparam stochastic_rounding switch between RNE and SR + * \param x bhalf_t value + * \return fp8_storage_t + */ +template +#if CK_FP8_CVT_FAST_PATH || CK_USE_OCP_FP8 +__host__ __device__ static inline fp8_storage_t cvt_bhalf_t_to_fp8(const ushort x) +#else +__host__ static inline fp8_storage_t cvt_bhalf_t_to_fp8(const ushort x) +#endif +{ + { + __is_interpret_supported(interp); + uint32_t rng = 0; + if 
constexpr(stochastic_rounding) + { + constexpr int seed = 1254739; +#ifndef CK_CODE_GEN_RTC + rng = prand_generator(reinterpret_cast(&x), + static_cast(x)); +#else + rng = prand_generator(reinterpret_cast(&x), static_cast(x)); +#endif + } +#if defined(__gfx950__) + return cast_to_f8_from_bf16(x, rng); +#else + std::ignore = rng; + return cvt_float_to_fp8( + bit_cast(uint32_t{x} << 16)); // convert value to float +#endif // defined(__gfx950__) + } +} + +/** + * \brief convert vector of 2 bhalf_t to vector of 2 @p fp8_storage_t + * + * \tparam sat saturation of fp8 + * \tparam interp interpretation of fp8 + * \tparam stochastic_rounding switch between RNE and SR + * \param x vector of 2 bhalf_t + * \return fp8x2_storage_t + */ +template +#if CK_FP8_CVT_FAST_PATH || CK_USE_OCP_FP8 +__host__ __device__ static inline fp8x2_storage_t cvt_bhalf_t_to_fp8(const ushortx2_t x) +#else +__host__ static inline fp8x2_storage_t cvt_bhalf_t_to_fp8(const ushortx2_t x) +#endif +{ +#if CK_WORKAROUND_BF16_TO_FP8_CONVERSION + return cvt_float_to_fp8( + float2_t{bit_cast(uint32_t{x[0]} << 16), + bit_cast(uint32_t{x[1]} << 16)}); // convert values to float +#else // CK_WORKAROUND_BF16_TO_FP8_CONVERSION + { + __is_interpret_supported(interp); + uint32_t rng = 0; + if constexpr(stochastic_rounding) + { + constexpr int seed = 1254739; +#ifndef CK_CODE_GEN_RTC + rng = prand_generator(reinterpret_cast(&x), + static_cast(x[0])); +#else + rng = prand_generator(reinterpret_cast(&x), + static_cast(x[0])); +#endif + } +#if defined(__gfx950__) + return cast_to_f8_from_bf16(x, rng); +#else + std::ignore = rng; + return cvt_float_to_fp8( + float2_t{bit_cast(uint32_t{x[0]} << 16), + bit_cast(uint32_t{x[1]} << 16)}); // convert values to float +#endif // defined(__gfx950__) + } +#endif // CK_WORKAROUND_BF16_TO_FP8_CONVERSION } } // namespace fp8_impl -// Declare a template function for fp8 conversion using RNE -template -__host__ __device__ constexpr Y f8_convert_rne(X x); - -// convert fp32 to fp8 with 
rounding to nearest even -template <> -inline __host__ __device__ f8_ocp_t f8_convert_rne(float x) -{ - return f8_ocp_t{ - fp8_impl::cvt_float_to_fp8(x)}; -} - -// convert fp32 to bf8 with rounding to nearest even -template <> -inline __host__ __device__ bf8_ocp_t f8_convert_rne(float x) -{ - return bf8_ocp_t{ - fp8_impl::cvt_float_to_fp8(x)}; -} - -// convert _Float16 to fp8 with rounding to nearest even -template <> -inline __host__ __device__ f8_ocp_t f8_convert_rne(_Float16 x) -{ - return f8_ocp_t{ - fp8_impl::cvt_half_t_to_fp8(x)}; -} - -template <> -inline __host__ __device__ bf8_ocp_t f8_convert_rne(_Float16 x) -{ - return bf8_ocp_t{ - fp8_impl::cvt_half_t_to_fp8( - x)}; -} - -// Declare a template function for fp8 conversion using RNE -template -__host__ __device__ constexpr Y f8_convert_sr(X x); - -// convert fp32 to fp8 with stochastic rounding -template <> -inline __host__ __device__ f8_ocp_t f8_convert_sr(float x) -{ - return f8_ocp_t{ - fp8_impl::cvt_float_to_fp8( - x)}; -} - -// convert fp32 to bf8 with stochastic rounding -template <> -inline __host__ __device__ bf8_ocp_t f8_convert_sr(float x) -{ - return bf8_ocp_t{fp8_impl::cvt_float_to_fp8(x)}; -} - -// convert _Float16 to fp8 with stochastic rounding -template <> -inline __host__ __device__ f8_ocp_t f8_convert_sr(_Float16 x) -{ - return f8_ocp_t{fp8_impl::cvt_half_t_to_fp8(x)}; -} - -// convert _Float16 to bf8 with stochastic rounding -template <> -inline __host__ __device__ bf8_ocp_t f8_convert_sr(_Float16 x) -{ - return bf8_ocp_t{fp8_impl::cvt_half_t_to_fp8(x)}; -} - #if CK_USE_OCP_FP8 using f8_t = f8_ocp_t; using bf8_t = bf8_ocp_t; diff --git a/include/ck/utility/mxf8_utils.hpp b/include/ck/utility/mxf8_utils.hpp index b7b98c6455..9046a24a3a 100644 --- a/include/ck/utility/mxf8_utils.hpp +++ b/include/ck/utility/mxf8_utils.hpp @@ -39,7 +39,7 @@ static __device__ float cast_to_f32_from_f8_scaled(float scale, fp8_storage_t v) } template -static __device__ float2_t 
cast_to_f32x2_from_f8x2_scaled(float scale, fp8x2_storage_t v) +static __device__ float2_t cast_to_f32_from_f8_scaled(float scale, fp8x2_storage_t v) { const auto i16val = bit_cast(v); diff --git a/include/ck/utility/scaled_type_convert.hpp b/include/ck/utility/scaled_type_convert.hpp index 9a9c53caec..f3e2bd3dd9 100644 --- a/include/ck/utility/scaled_type_convert.hpp +++ b/include/ck/utility/scaled_type_convert.hpp @@ -67,7 +67,7 @@ inline __host__ float2_t scaled_type_convert(e8m0_bexp_t s #endif { #if CK_MX_FP8_CVT_FAST_PATH - return fp8_impl::cast_to_f32x2_from_f8x2_scaled( + return fp8_impl::cast_to_f32_from_f8_scaled( type_convert(scale), x.AsType()[Number<0>{}]); #else return float2_t{scaled_type_convert(scale, x.AsType()[Number<0>{}]), @@ -86,7 +86,7 @@ inline __host__ float2_t scaled_type_convert(e8m0_bexp_t #endif { #if CK_MX_FP8_CVT_FAST_PATH - return fp8_impl::cast_to_f32x2_from_f8x2_scaled( + return fp8_impl::cast_to_f32_from_f8_scaled( type_convert(scale), x.AsType()[Number<0>{}]); #else return float2_t{scaled_type_convert(scale, x.AsType()[Number<0>{}]), diff --git a/include/ck/utility/type_convert.hpp b/include/ck/utility/type_convert.hpp index b9aeb44999..c8127aa887 100644 --- a/include/ck/utility/type_convert.hpp +++ b/include/ck/utility/type_convert.hpp @@ -117,7 +117,7 @@ inline __host__ __device__ constexpr bhalf_t type_convert(float #if CK_USE_RNE_BF16_CONVERSION return bf16_convert_rtn(x); #else - return uint16_t(u.int32 >> 16); + return uint16_t(uint32_t{x} >> 16); #endif } @@ -356,6 +356,180 @@ inline __host__ __device__ bf8_fnuz_t f8_convert_sr(half_t x #endif } +/** + * @brief Converts a float to a 8-bit float type (f8_ocp_t) using stochastic rounding. + * + * @param x The input float value. + * @return The converted f8_ocp_t value. 
+ */ +template <> +inline __host__ __device__ f8_ocp_t f8_convert_sr(float x) +{ + return f8_ocp_t{ + fp8_impl::cvt_float_to_fp8( + x)}; +} + +/** + * @brief Converts a vector of 2 floats to a vector of 2 8-bit float types (f8_ocp_t) using + * stochastic rounding. + * + * @param x The input vector of 2 floats. + * @return The converted vector of 2 f8_ocp_t. + */ +template <> +inline __host__ __device__ f8x2_ocp_t f8_convert_sr(float2_t x) +{ + return f8x2_ocp_t{ + fp8_impl::cvt_float_to_fp8( + x)}; +} + +/** + * @brief Converts a float to a 8-bit float type (bf8_ocp_t) using stochastic rounding. + * + * @param x The input float value. + * @return The converted bf8_ocp_t value. + */ +template <> +inline __host__ __device__ bf8_ocp_t f8_convert_sr(float x) +{ + return bf8_ocp_t{fp8_impl::cvt_float_to_fp8(x)}; +} + +/** + * @brief Converts a vector of 2 floats to a vector of 2 8-bit float types (bf8_ocp_t) using + * stochastic rounding. + * + * @param x The input vector of 2 floats. + * @return The converted vector of 2 bf8_ocp_t. + */ +template <> +inline __host__ __device__ bf8x2_ocp_t f8_convert_sr(float2_t x) +{ + return bf8x2_ocp_t{fp8_impl::cvt_float_to_fp8(x)}; +} + +/** + * @brief Converts a half_t to a 8-bit float type (f8_ocp_t) using stochastic rounding. + * + * @param x The input half_t value. + * @return The converted f8_ocp_t value. + */ +template <> +inline __host__ __device__ f8_ocp_t f8_convert_sr(half_t x) +{ + return f8_ocp_t{fp8_impl::cvt_half_t_to_fp8(x)}; +} + +/** + * @brief Converts a vector of 2 half_t to a vector of 2 8-bit float types (f8_ocp_t) using + * stochastic rounding. + * + * @param x The input vector of 2 half_t. + * @return The converted vector of 2 f8_ocp_t. + */ +template <> +inline __host__ __device__ f8x2_ocp_t f8_convert_sr(half2_t x) +{ + return f8x2_ocp_t{fp8_impl::cvt_half_t_to_fp8(x)}; +} + +/** + * @brief Converts a half_t to a 8-bit half_t type (bf8_ocp_t) using stochastic rounding. 
+ * + * @param x The input half_t value. + * @return The converted bf8_ocp_t value. + */ +template <> +inline __host__ __device__ bf8_ocp_t f8_convert_sr(half_t x) +{ + return bf8_ocp_t{fp8_impl::cvt_half_t_to_fp8(x)}; +} + +/** + * @brief Converts a vector of 2 half_t to a vector of 2 8-bit float types (bf8_ocp_t) using + * stochastic rounding. + * + * @param x The input vector of 2 half_t. + * @return The converted vector of 2 bf8_ocp_t. + */ +template <> +inline __host__ __device__ bf8x2_ocp_t f8_convert_sr(half2_t x) +{ + return bf8x2_ocp_t{fp8_impl::cvt_half_t_to_fp8(x)}; +} + +/** + * @brief Converts a bhalf_t to a 8-bit float type (f8_ocp_t) using stochastic rounding. + * + * @param x The input bhalf_t value. + * @return The converted f8_ocp_t value. + */ +template <> +inline __host__ __device__ f8_ocp_t f8_convert_sr(bhalf_t x) +{ + return f8_ocp_t{fp8_impl::cvt_bhalf_t_to_fp8(x)}; +} + +/** + * @brief Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (f8_ocp_t) using + * stochastic rounding. + * + * @param x The input vector of 2 bhalf_t. + * @return The converted vector of 2 f8_ocp_t. + */ +template <> +inline __host__ __device__ f8x2_ocp_t f8_convert_sr(bhalf2_t x) +{ + return f8x2_ocp_t{fp8_impl::cvt_bhalf_t_to_fp8(x)}; +} + +/** + * @brief Converts a bhalf_t to a 8-bit half_t type (bf8_ocp_t) using stochastic rounding. + * + * @param x The input bhalf_t value. + * @return The converted bf8_ocp_t value. + */ +template <> +inline __host__ __device__ bf8_ocp_t f8_convert_sr(bhalf_t x) +{ + return bf8_ocp_t{fp8_impl::cvt_bhalf_t_to_fp8(x)}; +} + +/** + * @brief Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (bf8_ocp_t) using + * stochastic rounding. + * + * @param x The input vector of 2 bhalf_t. + * @return The converted vector of 2 bf8_ocp_t. 
+ */ +template <> +inline __host__ __device__ bf8x2_ocp_t f8_convert_sr(bhalf2_t x) +{ + return bf8x2_ocp_t{fp8_impl::cvt_bhalf_t_to_fp8(x)}; +} + // Declare a template function for fp8 conversion using RNE template __host__ __device__ constexpr Y f8_convert_rne(X x); @@ -466,6 +640,172 @@ inline __host__ __device__ bf8_fnuz_t f8_convert_rne(half_t #endif } +/** + * @brief Converts a float to a 8-bit float type (f8_ocp_t) using rounding to nearest/even. + * + * @param x The input float value. + * @return The converted f8_ocp_t value. + */ +template <> +inline __host__ __device__ f8_ocp_t f8_convert_rne(float x) +{ + return f8_ocp_t{ + fp8_impl::cvt_float_to_fp8(x)}; +} + +/** + * @brief Converts a vector of 2 floats to a vector of 2 8-bit float types (f8_ocp_t) using rounding + * to nearest/even. + * + * @param x The input vector of 2 floats. + * @return The converted vector of 2 f8_ocp_t. + */ +template <> +inline __host__ __device__ f8x2_ocp_t f8_convert_rne(float2_t x) +{ + return f8x2_ocp_t{ + fp8_impl::cvt_float_to_fp8(x)}; +} + +/** + * @brief Converts a float to a 8-bit float type (bf8_ocp_t) using rounding to nearest/even. + * + * @param x The input float value. + * @return The converted bf8_ocp_t value. + */ +template <> +inline __host__ __device__ bf8_ocp_t f8_convert_rne(float x) +{ + return bf8_ocp_t{ + fp8_impl::cvt_float_to_fp8(x)}; +} + +/** + * @brief Converts a vector of 2 floats to a vector of 2 8-bit float types (bf8_ocp_t) using + * rounding to nearest/even. + * + * @param x The input vector of 2 floats. + * @return The converted vector of 2 bf8_ocp_t. + */ +template <> +inline __host__ __device__ bf8x2_ocp_t f8_convert_rne(float2_t x) +{ + return bf8x2_ocp_t{ + fp8_impl::cvt_float_to_fp8(x)}; +} + +/** + * @brief Converts a half_t to a 8-bit float type (f8_ocp_t) using rounding to nearest/even. + * + * @param x The input half_t value. + * @return The converted f8_ocp_t value. 
+ */ +template <> +inline __host__ __device__ f8_ocp_t f8_convert_rne(half_t x) +{ + return f8_ocp_t{ + fp8_impl::cvt_half_t_to_fp8(x)}; +} + +/** + * @brief Converts a vector of 2 half_t to a vector of 2 8-bit float types (f8_ocp_t) using rounding + * to nearest/even. + * + * @param x The input vector of 2 half_t. + * @return The converted vector of 2 f8_ocp_t. + */ +template <> +inline __host__ __device__ f8x2_ocp_t f8_convert_rne(half2_t x) +{ + return f8x2_ocp_t{ + fp8_impl::cvt_half_t_to_fp8(x)}; +} + +/** + * @brief Converts a half_t to a 8-bit half_t type (bf8_ocp_t) using rounding to nearest/even. + * + * @param x The input half_t value. + * @return The converted bf8_ocp_t value. + */ +template <> +inline __host__ __device__ bf8_ocp_t f8_convert_rne(half_t x) +{ + return bf8_ocp_t{ + fp8_impl::cvt_half_t_to_fp8( + x)}; +} + +/** + * @brief Converts a vector of 2 half_t to a vector of 2 8-bit float types (bf8_ocp_t) using + * rounding to nearest/even. + * + * @param x The input vector of 2 half_t. + * @return The converted vector of 2 bf8_ocp_t. + */ +template <> +inline __host__ __device__ bf8x2_ocp_t f8_convert_rne(half2_t x) +{ + return bf8x2_ocp_t{ + fp8_impl::cvt_half_t_to_fp8( + x)}; +} + +/** + * @brief Converts a bhalf_t to a 8-bit float type (f8_ocp_t) using rounding to nearest/even. + * + * @param x The input bhalf_t value. + * @return The converted f8_ocp_t value. + */ +template <> +inline __host__ __device__ f8_ocp_t f8_convert_rne(bhalf_t x) +{ + return f8_ocp_t{ + fp8_impl::cvt_bhalf_t_to_fp8(x)}; +} + +/** + * @brief Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (f8_ocp_t) using + * rounding to nearest/even. + * + * @param x The input vector of 2 bhalf_t. + * @return The converted vector of 2 f8_ocp_t. 
+ */ +template <> +inline __host__ __device__ f8x2_ocp_t f8_convert_rne(bhalf2_t x) +{ + return f8x2_ocp_t{ + fp8_impl::cvt_bhalf_t_to_fp8(x)}; +} + +/** + * @brief Converts a bhalf_t to a 8-bit half_t type (bf8_ocp_t) using rounding to nearest/even. + * + * @param x The input bhalf_t value. + * @return The converted bf8_ocp_t value. + */ +template <> +inline __host__ __device__ bf8_ocp_t f8_convert_rne(bhalf_t x) +{ + return bf8_ocp_t{ + fp8_impl::cvt_bhalf_t_to_fp8( + x)}; +} + +/** + * @brief Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (bf8_ocp_t) using + * rounding to nearest/even. + * + * @param x The input vector of 2 bhalf_t. + * @return The converted vector of 2 bf8_ocp_t. + */ +template <> +inline __host__ __device__ bf8x2_ocp_t f8_convert_rne(bhalf2_t x) +{ + return bf8x2_ocp_t{ + fp8_impl::cvt_bhalf_t_to_fp8( + x)}; +} + // convert fp32 to fp8 template <> inline __host__ __device__ f8_fnuz_t type_convert(float x) @@ -477,17 +817,6 @@ inline __host__ __device__ f8_fnuz_t type_convert(float x) #endif } -// convert fp32 to fp8 -template <> -inline __host__ __device__ f8_ocp_t type_convert(float x) -{ -#if CK_USE_SR_F8_CONVERSION - return f8_convert_sr(x); -#else - return f8_convert_rne(x); -#endif -} - // convert fp8 to fp32 template <> inline __host__ __device__ float type_convert(f8_fnuz_t x) @@ -524,12 +853,39 @@ inline __host__ __device__ float2_t type_convert(f8x2_fnu #endif } +/** + * @brief Converts a f8_ocp_t value to a float value. + * + * @param x The input f8_ocp_t value. + * @return The converted float value. + */ +template <> +inline __host__ __device__ float type_convert(f8_ocp_t x) +{ +#if CK_OCP_FP8_CVT_FAST_PATH + union + { + unsigned int i32val; + fp8_storage_t i8val[4]; + } val; + val.i8val[0] = x.data; + return __builtin_amdgcn_cvt_f32_fp8(val.i32val, 0); +#else + return fp8_impl::cast_from_f8(x.data); +#endif +} + +/** + * @brief Converts a vector of 2 f8_ocp_t values to a vector of 2 float values. 
+ * + * @param x The input vector of 2 f8_ocp_t values. + * @return The converted vector of 2 float values. + */ template <> inline __host__ __device__ float2_t type_convert(f8x2_ocp_t x) { #if CK_OCP_FP8_CVT_FAST_PATH - return fp8_impl::cast_to_f32x2_from_f8x2( - x.AsType()[Number<0>{}]); + return __builtin_amdgcn_cvt_pk_f32_fp8(bit_cast(x), false); #else return float2_t{fp8_impl::cast_from_f8( x.AsType()[Number<0>{}]), @@ -538,6 +894,229 @@ inline __host__ __device__ float2_t type_convert(f8x2_ocp_ #endif } +/** + * @brief Converts a f8_ocp_t value to a half_t value. + * + * @param x The input f8_ocp_t value. + * @return The converted half_t value. + */ +template <> +inline __host__ __device__ half_t type_convert(f8_ocp_t x) +{ +#if defined(__gfx950__) + union + { + uint16_t i16val; + fp8_storage_t i8val[2]; + } input; + input.i8val[0] = x.data; + + union + { + half2_t half_vec; + half_t half_arr[2]; + } output; + output.half_vec = __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(input.i16val, /*scale*/ 1.f, 0); + + return output.half_arr[0]; +#else + return fp8_impl::cast_from_f8(x.data); +#endif +} + +/** + * @brief Converts a vector of 2 f8_ocp_t values to a vector of 2 half_t values. + * + * @param x The input vector of 2 f8_ocp_t values. + * @return The converted vector of 2 half_t values. + */ +template <> +inline __host__ __device__ half2_t type_convert(f8x2_ocp_t x) +{ +#if defined(__gfx950__) + return __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(bit_cast(x), /*scale*/ 1.f, 0); +#else + return half2_t{type_convert(float(x.AsType()[Number<0>{}])), + type_convert(float(x.AsType()[Number<1>{}]))}; +#endif +} + +/** + * @brief Converts a f8_ocp_t value to a bhalf_t value. + * + * @param x The input f8_ocp_t value. + * @return The converted bhalf_t value. 
+ */ +template <> +inline __host__ __device__ bhalf_t type_convert(f8_ocp_t x) +{ +#if defined(__gfx950__) + union + { + uint16_t i16val; + fp8_storage_t i8val[2]; + } input; + input.i8val[0] = x.data; + + union + { + bhalf2_t bhalf_vec; + bhalf_t bhalf_arr[2]; + } output; + output.bhalf_vec = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(input.i16val, /*scale*/ 1.f, 0); + + return output.bhalf_arr[0]; +#else + return type_convert( + fp8_impl::cast_from_f8(x.data)); +#endif +} + +/** + * @brief Converts a vector of 2 f8_ocp_t values to a vector of 2 bhalf_t values. + * + * @param x The input vector of 2 f8_ocp_t values. + * @return The converted vector of 2 bhalf_t values. + */ +template <> +inline __host__ __device__ bhalf2_t type_convert(f8x2_ocp_t x) +{ +#if defined(__gfx950__) + return __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(bit_cast(x), /*scale*/ 1.f, 0); +#else + return bhalf2_t{type_convert(float(x.AsType()[Number<0>{}])), + type_convert(float(x.AsType()[Number<1>{}]))}; +#endif +} + +/** + * @brief Converts a bf8_ocp_t value to a float value. + * + * @param x The input bf8_ocp_t value. + * @return The converted float value. + */ +template <> +inline __host__ __device__ float type_convert(bf8_ocp_t x) +{ +#if CK_OCP_FP8_CVT_FAST_PATH + union + { + unsigned int i32val; + fp8_storage_t i8val[4]; + } val; + val.i8val[0] = x.data; + return __builtin_amdgcn_cvt_f32_bf8(val.i32val, 0); +#else + return fp8_impl::cast_from_f8(x.data); +#endif +} + +/** + * @brief Converts a vector of 2 bf8_ocp_t values to a vector of 2 float values. + * + * @param x The input vector of 2 bf8_ocp_t values. + * @return The converted vector of 2 float values. 
+ */ +template <> +inline __host__ __device__ float2_t type_convert(bf8x2_ocp_t x) +{ +#if CK_OCP_FP8_CVT_FAST_PATH + return __builtin_amdgcn_cvt_pk_f32_bf8(bit_cast(x), false); +#else + return float2_t{fp8_impl::cast_from_f8( + x.AsType()[Number<0>{}]), + fp8_impl::cast_from_f8( + x.AsType()[Number<1>{}])}; +#endif +} + +/** + * @brief Converts a bf8_ocp_t value to a half_t value. + * + * @param x The input bf8_ocp_t value. + * @return The converted half_t value. + */ +template <> +inline __host__ __device__ half_t type_convert(bf8_ocp_t x) +{ +#if defined(__gfx950__) + union + { + uint16_t i16val; + fp8_storage_t i8val[2]; + } val; + val.i8val[0] = x.data; + return __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(val.i16val, /*scale*/ 1.f, 0)[0]; +#else + return fp8_impl::cast_from_f8(x.data); +#endif +} + +/** + * @brief Converts a vector of 2 bf8_ocp_t values to a vector of 2 half_t values. + * + * @param x The input vector of 2 bf8_ocp_t values. + * @return The converted vector of 2 half_t values. + */ +template <> +inline __host__ __device__ half2_t type_convert(bf8x2_ocp_t x) +{ +#if defined(__gfx950__) + return __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(bit_cast(x), /*scale*/ 1.f, 0); +#else + return half2_t{type_convert(float(x.AsType()[Number<0>{}])), + type_convert(float(x.AsType()[Number<1>{}]))}; +#endif +} + +/** + * @brief Converts a bf8_ocp_t value to a bhalf_t value. + * + * @param x The input bf8_ocp_t value. + * @return The converted bhalf_t value. 
+ */ +template <> +inline __host__ __device__ bhalf_t type_convert(bf8_ocp_t x) +{ +#if defined(__gfx950__) + union + { + uint16_t i16val; + fp8_storage_t i8val[2]; + } input; + input.i8val[0] = x.data; + + union + { + bhalf2_t bhalf_vec; + bhalf_t bhalf_arr[2]; + } output; + output.bhalf_vec = __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(input.i16val, /*scale*/ 1.f, 0); + + return output.bhalf_arr[0]; +#else + return type_convert( + fp8_impl::cast_from_f8(x.data)); +#endif +} + +/** + * @brief Converts a vector of 2 bf8_ocp_t values to a vector of 2 bhalf_t values. + * + * @param x The input vector of 2 bf8_ocp_t values. + * @return The converted vector of 2 bhalf_t values. + */ +template <> +inline __host__ __device__ bhalf2_t type_convert(bf8x2_ocp_t x) +{ +#if defined(__gfx950__) + return __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(bit_cast(x), /*scale*/ 1.f, 0); +#else + return bhalf2_t{type_convert(float(x.AsType()[Number<0>{}])), + type_convert(float(x.AsType()[Number<1>{}]))}; +#endif +} + template <> inline __host__ __device__ float2_t type_convert(pk_i4_t x) { @@ -610,7 +1189,12 @@ inline __host__ __device__ f8_fnuz_t type_convert(half_t x) #endif } -// convert fp16 to fp8 +/** + * @brief Converts a half_t value to a f8_ocp_t value with rounding determined by a flag. + * + * @param x The input half_t value. + * @return The converted f8_ocp_t value. + */ template <> inline __host__ __device__ f8_ocp_t type_convert(half_t x) { @@ -621,6 +1205,22 @@ inline __host__ __device__ f8_ocp_t type_convert(half_t x) #endif } +/** + * @brief Converts a half_t value to a bf8_ocp_t value with rounding determined by a flag. + * + * @param x The input half_t value. + * @return The converted bf8_ocp_t value. 
+ */ +template <> +inline __host__ __device__ bf8_ocp_t type_convert(half_t x) +{ +#if CK_USE_SR_F8_CONVERSION + return f8_convert_sr(x); +#else + return f8_convert_rne(x); +#endif +} + // convert fp8 to fp16 template <> inline __host__ __device__ half_t type_convert(f8_fnuz_t x) @@ -645,7 +1245,28 @@ inline __host__ __device__ bf8_fnuz_t type_convert(float x) #endif } -// convert fp32 to bf8 +/** + * @brief Converts a float value to a f8_ocp_t value with rounding determined by a flag. + * + * @param x The input float value. + * @return The converted f8_ocp_t value. + */ +template <> +inline __host__ __device__ f8_ocp_t type_convert(float x) +{ +#if CK_USE_SR_F8_CONVERSION + return f8_convert_sr(x); +#else + return f8_convert_rne(x); +#endif +} + +/** + * @brief Converts a float value to a bf8_ocp_t value with rounding determined by a flag. + * + * @param x The input float value. + * @return The converted bf8_ocp_t value. + */ template <> inline __host__ __device__ bf8_ocp_t type_convert(float x) { @@ -656,6 +1277,38 @@ inline __host__ __device__ bf8_ocp_t type_convert(float x) #endif } +/** + * @brief Converts a bhalf_t value to a f8_ocp_t value with rounding determined by a flag. + * + * @param x The input bhalf_t value. + * @return The converted f8_ocp_t value. + */ +template <> +inline __host__ __device__ f8_ocp_t type_convert(bhalf_t x) +{ +#if CK_USE_SR_F8_CONVERSION + return f8_convert_sr(x); +#else + return f8_convert_rne(x); +#endif +} + +/** + * @brief Converts a bhalf_t value to a bf8_ocp_t value with rounding determined by a flag. + * + * @param x The input bhalf_t value. + * @return The converted bf8_ocp_t value. 
+ */ +template <> +inline __host__ __device__ bf8_ocp_t type_convert(bhalf_t x) +{ +#if CK_USE_SR_F8_CONVERSION + return f8_convert_sr(x); +#else + return f8_convert_rne(x); +#endif +} + // convert bf8 to fp32 template <> inline __host__ __device__ float type_convert(bf8_fnuz_t x) @@ -683,17 +1336,6 @@ inline __host__ __device__ bf8_fnuz_t type_convert(half_t x) #endif } -// convert fp16 to bf8 -template <> -inline __host__ __device__ bf8_ocp_t type_convert(half_t x) -{ -#if CK_USE_SR_F8_CONVERSION - return f8_convert_sr(x); -#else - return f8_convert_rne(x); -#endif -} - // convert bf8 to fp16 template <> inline __host__ __device__ half_t type_convert(bf8_fnuz_t x) diff --git a/test/data_type/test_bf8_ocp.cpp b/test/data_type/test_bf8_ocp.cpp index 9d4ee38b15..285e7e69fc 100644 --- a/test/data_type/test_bf8_ocp.cpp +++ b/test/data_type/test_bf8_ocp.cpp @@ -1,13 +1,19 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "gtest/gtest.h" +#include "ck/library/utility/device_memory.hpp" #include "ck/utility/data_type.hpp" #include "ck/utility/type_convert.hpp" using ck::bf8_ocp_t; +using ck::bf8x2_ocp_t; +using ck::bhalf2_t; +using ck::bhalf_t; using ck::f8_convert_rne; using ck::f8_convert_sr; +using ck::float2_t; +using ck::half2_t; using ck::half_t; using ck::type_convert; @@ -266,3 +272,590 @@ TEST(BF8OCP, ConvertFP16Stochastic) const auto bf8_nan = f8_convert_sr(ck::NumericLimits::QuietNaN()); ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data)); } + +constexpr uint64_t test_size = 256 + 6; + +__host__ __device__ void +test_fp32_bf8_type_convert(uint64_t N, float* p_test, uint64_t* p_completed) +{ + if(p_completed == nullptr) + { + return; + } + + uint64_t& i = *p_completed; + i = 0; + + if(p_test == nullptr) + { + return; + } + + for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++) + { + uint8_t bf8_uid = static_cast(bf8_id); + auto v = type_convert(bf8_ocp_t{bf8_uid}); + p_test[i] = v; + i++; + if(i >= N) + { + return; + } + } + + /// Test vector conversion + // bf8x2 -> fp32x2 + bf8x2_ocp_t bf8x2{bf8x2_ocp_t::data_v{0b10000100, 0b00000001}}; //-2^-14, 2^-16 + + float2_t f32x2 = type_convert(bf8x2); + p_test[i++] = f32x2[0]; + if(i >= N) + { + return; + } + p_test[i++] = f32x2[1]; + if(i >= N) + { + return; + } + + // fp32x2 -> bf8x2 + f32x2 = {-4.0f, 2.0f}; + bf8x2 = f8_convert_rne(f32x2); // expect {-4, 2} + + p_test[i++] = type_convert(bf8x2.AsType()(ck::Number<0>{})); //-4f + if(i >= N) + { + return; + } + p_test[i++] = type_convert(bf8x2.AsType()(ck::Number<1>{})); // 2f + if(i >= N) + { + return; + } + + bf8x2 = f8_convert_sr(f32x2); // expect {-4, 2} + + p_test[i++] = type_convert(bf8x2.AsType()(ck::Number<0>{})); //-4f + if(i >= N) + { + return; + } + p_test[i++] = type_convert(bf8x2.AsType()(ck::Number<1>{})); // 2f + if(i >= N) + { + return; + } +} + +TEST(BF8OCP, HostFP32BF8Convert) +{ + std::vector out(test_size, -1.0f); + uint64_t completed = 0; + + 
test_fp32_bf8_type_convert(test_size, out.data(), &completed); + + std::set bf8_nan_ids; + bf8_nan_ids.insert(0b11111111); + bf8_nan_ids.insert(0b01111111); + bf8_nan_ids.insert(0b11111101); + bf8_nan_ids.insert(0b01111101); + bf8_nan_ids.insert(0b11111110); + bf8_nan_ids.insert(0b01111110); + for(auto bf8_nan_id : bf8_nan_ids) + { + auto idx = bf8_nan_id; + ASSERT_TRUE(std::isnan(out[idx])); + } + + for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++) + { + if(bf8_nan_ids.find(bf8_id) != bf8_nan_ids.end()) + continue; + + uint8_t bf8_uid = static_cast(bf8_id); + auto idx = bf8_uid; + ASSERT_FLOAT_EQ(out[idx], type_convert(bf8_ocp_t{bf8_uid})) + << " bf8_id: " << bf8_id << std::endl + << type_convert(bf8_ocp_t{bf8_uid}); + } + + // /// Test vector conversions + + auto i = 256; + + // bf8x2 -> fp32x2 + EXPECT_EQ(out[i++], -powf(2.0f, -14.0f)); + EXPECT_EQ(out[i++], powf(2.0f, -16.0f)); + + // fp32x2 -> bf8x2 + // RNE + EXPECT_EQ(out[i++], -4.0f); + EXPECT_EQ(out[i++], 2.0f); + // SR + EXPECT_EQ(out[i++], -4.0f); + EXPECT_EQ(out[i++], 2.0f); + + EXPECT_EQ(test_size, completed); + EXPECT_EQ(test_size, i); +} + +__global__ void device_test_fp32_bf8_type_convert(uint64_t N, float* p_test, uint64_t* p_completed) +{ + test_fp32_bf8_type_convert(N, p_test, p_completed); +} + +TEST(BF8OCP, DeviceFP32BF8Convert) +{ + std::vector out(test_size, -1.0f); + + DeviceMem device_out(test_size * sizeof(float)); + DeviceMem device_completed(sizeof(uint64_t)); + + device_out.SetValue(-21.0f); + device_completed.SetValue(-21.0f); + + device_test_fp32_bf8_type_convert<<<1, 1>>>( + test_size, + static_cast(device_out.GetDeviceBuffer()), + static_cast(device_completed.GetDeviceBuffer())); + + uint64_t completed = 0; + device_completed.FromDevice(&completed); + device_out.FromDevice(out.data()); + + std::set bf8_nan_ids; + bf8_nan_ids.insert(0b11111111); + bf8_nan_ids.insert(0b01111111); + bf8_nan_ids.insert(0b11111101); + bf8_nan_ids.insert(0b01111101); + bf8_nan_ids.insert(0b11111110); + 
bf8_nan_ids.insert(0b01111110); + for(auto bf8_nan_id : bf8_nan_ids) + { + auto idx = bf8_nan_id; + ASSERT_TRUE(std::isnan(out[idx])) << "idx: " << idx << " out[idx]: " << out[idx]; + } + + for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++) + { + if(bf8_nan_ids.find(bf8_id) != bf8_nan_ids.end()) + continue; + + uint8_t bf8_uid = static_cast(bf8_id); + auto idx = bf8_uid; + ASSERT_FLOAT_EQ(out[idx], type_convert(bf8_ocp_t{bf8_uid})) + << " bf8_id: " << bf8_id << std::endl + << type_convert(bf8_ocp_t{bf8_uid}); + } + + /// Test vector conversions + + auto i = 256; + + // bf8x2 -> fp32x2 + EXPECT_EQ(out[i++], -powf(2.0f, -14.0f)); + EXPECT_EQ(out[i++], powf(2.0f, -16.0f)); + + // fp32x2 -> bf8x2 + // RNE + EXPECT_EQ(out[i++], -4.0f); + EXPECT_EQ(out[i++], 2.0f); + // SR + EXPECT_EQ(out[i++], -4.0f); + EXPECT_EQ(out[i++], 2.0f); + + EXPECT_EQ(test_size, completed); + EXPECT_EQ(test_size, i); +} + +__host__ __device__ void +test_fp16_bf8_type_convert(uint64_t N, half_t* p_test, uint64_t* p_completed) +{ + if(p_completed == nullptr) + { + return; + } + + uint64_t& i = *p_completed; + i = 0; + + if(p_test == nullptr) + { + return; + } + + for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++) + { + uint8_t bf8_uid = static_cast(bf8_id); + auto v = type_convert(bf8_ocp_t{bf8_uid}); + p_test[i] = v; + i++; + if(i >= N) + { + return; + } + } + + /// Test vector conversion + // bf8x2 -> fp16x2 + bf8x2_ocp_t bf8x2{bf8x2_ocp_t::data_v{0b10000100, 0b00000001}}; //-2^-14, 2^-16 + + half2_t f16x2 = type_convert(bf8x2); + p_test[i++] = f16x2[0]; + if(i >= N) + { + return; + } + p_test[i++] = f16x2[1]; + if(i >= N) + { + return; + } + + // fp16x2 -> bf8x2 + f16x2 = {-4.0f, 2.0f}; + bf8x2 = f8_convert_rne(f16x2); // expect {-4, 2} + + p_test[i++] = type_convert(bf8x2.AsType()(ck::Number<0>{})); //-4f + if(i >= N) + { + return; + } + p_test[i++] = type_convert(bf8x2.AsType()(ck::Number<1>{})); // 2f + if(i >= N) + { + return; + } + + bf8x2 = f8_convert_sr(f16x2); // expect {-4, 2} + + 
p_test[i++] = type_convert(bf8x2.AsType()(ck::Number<0>{})); //-4f + if(i >= N) + { + return; + } + p_test[i++] = type_convert(bf8x2.AsType()(ck::Number<1>{})); // 2f + if(i >= N) + { + return; + } +} + +TEST(BF8OCP, HostFP16BF8Convert) +{ + std::vector out(test_size, -1.0f); + uint64_t completed = 0; + + test_fp16_bf8_type_convert(test_size, out.data(), &completed); + + std::set bf8_nan_ids; + bf8_nan_ids.insert(0b11111111); + bf8_nan_ids.insert(0b01111111); + bf8_nan_ids.insert(0b11111101); + bf8_nan_ids.insert(0b01111101); + bf8_nan_ids.insert(0b11111110); + bf8_nan_ids.insert(0b01111110); + for(auto bf8_nan_id : bf8_nan_ids) + { + auto idx = bf8_nan_id; + ASSERT_TRUE(std::isnan(type_convert(out[idx]))); + } + + for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++) + { + if(bf8_nan_ids.find(bf8_id) != bf8_nan_ids.end()) + continue; + + uint8_t bf8_uid = static_cast(bf8_id); + auto idx = bf8_uid; + ASSERT_FLOAT_EQ(out[idx], type_convert(bf8_ocp_t{bf8_uid})) + << " bf8_id: " << bf8_id << std::endl + << type_convert(type_convert(bf8_ocp_t{bf8_uid})); + } + + // /// Test vector conversions + + auto i = 256; + + // bf8x2 -> fp16x2 + EXPECT_EQ(out[i++], type_convert(-powf(2.0f, -14.0f))); + EXPECT_EQ(out[i++], type_convert(powf(2.0f, -16.0f))); + + // fp16x2 -> bf8x2 + // RNE + EXPECT_EQ(out[i++], type_convert(-4.0f)); + EXPECT_EQ(out[i++], type_convert(2.0f)); + // SR + EXPECT_EQ(out[i++], type_convert(-4.0f)); + EXPECT_EQ(out[i++], type_convert(2.0f)); + + EXPECT_EQ(test_size, completed); + EXPECT_EQ(test_size, i); +} + +__global__ void device_test_fp16_bf8_type_convert(uint64_t N, half_t* p_test, uint64_t* p_completed) +{ + test_fp16_bf8_type_convert(N, p_test, p_completed); +} + +TEST(BF8OCP, DeviceFP16BF8Convert) +{ + std::vector out(test_size, -1.0f); + + DeviceMem device_out(test_size * sizeof(half_t)); + DeviceMem device_completed(sizeof(uint64_t)); + + device_out.SetValue(-21.0f); + device_completed.SetValue(-21.0f); + + device_test_fp16_bf8_type_convert<<<1, 
1>>>( + test_size, + static_cast(device_out.GetDeviceBuffer()), + static_cast(device_completed.GetDeviceBuffer())); + + uint64_t completed = 0; + device_completed.FromDevice(&completed); + device_out.FromDevice(out.data()); + + std::set bf8_nan_ids; + bf8_nan_ids.insert(0b11111111); + bf8_nan_ids.insert(0b01111111); + bf8_nan_ids.insert(0b11111101); + bf8_nan_ids.insert(0b01111101); + bf8_nan_ids.insert(0b11111110); + bf8_nan_ids.insert(0b01111110); + for(auto bf8_nan_id : bf8_nan_ids) + { + auto idx = bf8_nan_id; + ASSERT_TRUE(std::isnan(type_convert(out[idx]))) + << "idx: " << idx << " out[idx]: " << type_convert(out[idx]); + } + + for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++) + { + if(bf8_nan_ids.find(bf8_id) != bf8_nan_ids.end()) + continue; + + uint8_t bf8_uid = static_cast(bf8_id); + auto idx = bf8_uid; + ASSERT_FLOAT_EQ(out[idx], type_convert(bf8_ocp_t{bf8_uid})) + << " bf8_id: " << bf8_id << std::endl + << type_convert(type_convert(bf8_ocp_t{bf8_uid})); + } + + /// Test vector conversions + + auto i = 256; + + // bf8x2 -> fp16x2 + EXPECT_EQ(out[i++], type_convert(-powf(2.0f, -14.0f))); + EXPECT_EQ(out[i++], type_convert(powf(2.0f, -16.0f))); + + // fp16x2 -> bf8x2 + // RNE + EXPECT_EQ(out[i++], type_convert(-4.0f)); + EXPECT_EQ(out[i++], type_convert(2.0f)); + // SR + EXPECT_EQ(out[i++], type_convert(-4.0f)); + EXPECT_EQ(out[i++], type_convert(2.0f)); + + EXPECT_EQ(test_size, completed); + EXPECT_EQ(test_size, i); +} + +__host__ __device__ void +test_bf16_bf8_type_convert(uint64_t N, bhalf_t* p_test, uint64_t* p_completed) +{ + if(p_completed == nullptr) + { + return; + } + + uint64_t& i = *p_completed; + i = 0; + + if(p_test == nullptr) + { + return; + } + + for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++) + { + uint8_t bf8_uid = static_cast(bf8_id); + auto v = type_convert(bf8_ocp_t{bf8_uid}); + p_test[i] = v; + i++; + if(i >= N) + { + return; + } + } + + /// Test vector conversion + // bf8x2 -> bf16x2 + bf8x2_ocp_t 
bf8x2{bf8x2_ocp_t::data_v{0b10000100, 0b00000001}}; //-2^-14, 2^-16 + + bhalf2_t bf16x2 = type_convert(bf8x2); + p_test[i++] = bf16x2[0]; + if(i >= N) + { + return; + } + p_test[i++] = bf16x2[1]; + if(i >= N) + { + return; + } + + // bf16x2 -> bf8x2 + bf16x2 = {type_convert(-4.0f), type_convert(2.0f)}; + bf8x2 = f8_convert_rne(bf16x2); // expect {-4, 2} + + p_test[i++] = type_convert(bf8x2.AsType()(ck::Number<0>{})); //-4f + if(i >= N) + { + return; + } + p_test[i++] = type_convert(bf8x2.AsType()(ck::Number<1>{})); // 2f + if(i >= N) + { + return; + } + + bf8x2 = f8_convert_sr(bf16x2); // expect {-4, 2} + + p_test[i++] = type_convert(bf8x2.AsType()(ck::Number<0>{})); //-4f + if(i >= N) + { + return; + } + p_test[i++] = type_convert(bf8x2.AsType()(ck::Number<1>{})); // 2f + if(i >= N) + { + return; + } +} + +TEST(BF8OCP, HostBF16BF8Convert) +{ + std::vector out(test_size, -1.0f); + uint64_t completed = 0; + + test_bf16_bf8_type_convert(test_size, out.data(), &completed); + + std::set bf8_nan_ids; + bf8_nan_ids.insert(0b11111111); + bf8_nan_ids.insert(0b01111111); + bf8_nan_ids.insert(0b11111101); + bf8_nan_ids.insert(0b01111101); + bf8_nan_ids.insert(0b11111110); + bf8_nan_ids.insert(0b01111110); + for(auto bf8_nan_id : bf8_nan_ids) + { + auto idx = bf8_nan_id; + ASSERT_TRUE(std::isnan(type_convert(out[idx]))); + } + + for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++) + { + if(bf8_nan_ids.find(bf8_id) != bf8_nan_ids.end()) + continue; + + uint8_t bf8_uid = static_cast(bf8_id); + auto idx = bf8_uid; + ASSERT_FLOAT_EQ(out[idx], type_convert(bf8_ocp_t{bf8_uid})) + << " bf8_id: " << bf8_id << std::endl + << type_convert(type_convert(bf8_ocp_t{bf8_uid})); + } + + // /// Test vector conversions + + auto i = 256; + + // bf8x2 -> bf16x2 + EXPECT_EQ(out[i++], type_convert(-powf(2.0f, -14.0f))); + EXPECT_EQ(out[i++], type_convert(powf(2.0f, -16.0f))); + + // bf16x2 -> bf8x2 + // RNE + EXPECT_EQ(out[i++], type_convert(-4.0f)); + EXPECT_EQ(out[i++], type_convert(2.0f)); + // 
SR + EXPECT_EQ(out[i++], type_convert(-4.0f)); + EXPECT_EQ(out[i++], type_convert(2.0f)); + + EXPECT_EQ(test_size, completed); + EXPECT_EQ(test_size, i); +} + +__global__ void +device_test_bf16_bf8_type_convert(uint64_t N, bhalf_t* p_test, uint64_t* p_completed) +{ + test_bf16_bf8_type_convert(N, p_test, p_completed); +} + +TEST(BF8OCP, DeviceBF16BF8Convert) +{ + std::vector out(test_size, -1.0f); + + DeviceMem device_out(test_size * sizeof(bhalf_t)); + DeviceMem device_completed(sizeof(uint64_t)); + + device_out.SetValue(-21.0f); + device_completed.SetValue(-21.0f); + + device_test_bf16_bf8_type_convert<<<1, 1>>>( + test_size, + static_cast(device_out.GetDeviceBuffer()), + static_cast(device_completed.GetDeviceBuffer())); + + uint64_t completed = 0; + device_completed.FromDevice(&completed); + device_out.FromDevice(out.data()); + + std::set bf8_nan_ids; + bf8_nan_ids.insert(0b11111111); + bf8_nan_ids.insert(0b01111111); + bf8_nan_ids.insert(0b11111101); + bf8_nan_ids.insert(0b01111101); + bf8_nan_ids.insert(0b11111110); + bf8_nan_ids.insert(0b01111110); + for(auto bf8_nan_id : bf8_nan_ids) + { + auto idx = bf8_nan_id; + ASSERT_TRUE(std::isnan(type_convert(out[idx]))) + << "idx: " << idx << " out[idx]: " << type_convert(out[idx]); + } + + for(ck::index_t bf8_id = 0; bf8_id < 256; bf8_id++) + { + if(bf8_nan_ids.find(bf8_id) != bf8_nan_ids.end()) + continue; + + uint8_t bf8_uid = static_cast(bf8_id); + auto idx = bf8_uid; + ASSERT_FLOAT_EQ(out[idx], type_convert(bf8_ocp_t{bf8_uid})) + << " bf8_id: " << bf8_id << std::endl + << type_convert(type_convert(bf8_ocp_t{bf8_uid})); + } + + /// Test vector conversions + + auto i = 256; + + // bf8x2 -> bf16x2 + EXPECT_EQ(out[i++], type_convert(-powf(2.0f, -14.0f))); + EXPECT_EQ(out[i++], type_convert(powf(2.0f, -16.0f))); + + // bf16x2 -> bf8x2 + // RNE + EXPECT_EQ(out[i++], type_convert(-4.0f)); + EXPECT_EQ(out[i++], type_convert(2.0f)); + // SR + EXPECT_EQ(out[i++], type_convert(-4.0f)); + EXPECT_EQ(out[i++], 
type_convert(2.0f)); + + EXPECT_EQ(test_size, completed); + EXPECT_EQ(test_size, i); +} diff --git a/test/data_type/test_fp8_ocp.cpp b/test/data_type/test_fp8_ocp.cpp index 944dd89930..bf562112c8 100644 --- a/test/data_type/test_fp8_ocp.cpp +++ b/test/data_type/test_fp8_ocp.cpp @@ -1,13 +1,19 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "gtest/gtest.h" +#include "ck/library/utility/device_memory.hpp" #include "ck/utility/data_type.hpp" #include "ck/utility/type_convert.hpp" +using ck::bhalf2_t; +using ck::bhalf_t; using ck::f8_convert_rne; using ck::f8_convert_sr; using ck::f8_ocp_t; +using ck::f8x2_ocp_t; +using ck::float2_t; +using ck::half2_t; using ck::half_t; using ck::type_convert; @@ -248,3 +254,566 @@ TEST(FP8OCP, ConvertFP16Stochastic) auto f8_nan = f8_convert_sr(ck::NumericLimits::QuietNaN()); ASSERT_TRUE(ck::fp8_impl::ocp_f8_is_nan(f8_nan.data)); } + +constexpr uint64_t test_size = 256 + 6; + +__host__ __device__ void +test_fp32_fp8_type_convert(uint64_t N, float* p_test, uint64_t* p_completed) +{ + if(p_completed == nullptr) + { + return; + } + + uint64_t& i = *p_completed; + i = 0; + + if(p_test == nullptr) + { + return; + } + + for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++) + { + uint8_t fp8_uid = static_cast(fp8_id); + auto v = type_convert(f8_ocp_t{fp8_uid}); + p_test[i] = v; + i++; + if(i >= N) + { + return; + } + } + + /// Test vector conversion + // fp8x2 -> fp32x2 + f8x2_ocp_t fp8x2{f8x2_ocp_t::data_v{0b10001000, 0b00000001}}; //-2^-6, 2^-9 + + float2_t f32x2 = type_convert(fp8x2); + p_test[i++] = f32x2[0]; + if(i >= N) + { + return; + } + p_test[i++] = f32x2[1]; + if(i >= N) + { + return; + } + + // fp32x2 -> fp8x2 + f32x2 = {-4.0f, 2.0f}; + fp8x2 = f8_convert_rne(f32x2); // expect {-4, 2} + + p_test[i++] = type_convert(fp8x2.AsType()(ck::Number<0>{})); //-4f + if(i >= N) + { + return; + } 
+ p_test[i++] = type_convert(fp8x2.AsType()(ck::Number<1>{})); // 2f + if(i >= N) + { + return; + } + + fp8x2 = f8_convert_sr(f32x2); // expect {-4, 2} + + p_test[i++] = type_convert(fp8x2.AsType()(ck::Number<0>{})); //-4f + if(i >= N) + { + return; + } + p_test[i++] = type_convert(fp8x2.AsType()(ck::Number<1>{})); // 2f + if(i >= N) + { + return; + } +} + +TEST(FP8OCP, HostFP32FP8Convert) +{ + std::vector out(test_size, -1.0f); + uint64_t completed = 0; + + test_fp32_fp8_type_convert(test_size, out.data(), &completed); + + std::set fp8_nan_ids; + fp8_nan_ids.insert(0b11111111); //-NaN + fp8_nan_ids.insert(0b01111111); // +NaN + for(auto fp8_nan_id : fp8_nan_ids) + { + auto idx = fp8_nan_id; + ASSERT_TRUE(std::isnan(out[idx])); + } + + for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++) + { + if(fp8_nan_ids.find(fp8_id) != fp8_nan_ids.end()) + continue; + + uint8_t fp8_uid = static_cast(fp8_id); + auto idx = fp8_uid; + ASSERT_FLOAT_EQ(out[idx], type_convert(f8_ocp_t{fp8_uid})) + << " fp8_id: " << fp8_id << std::endl + << type_convert(f8_ocp_t{fp8_uid}); + } + + // /// Test vector conversions + + auto i = 256; + + // fp8x2 -> fp32x2 + EXPECT_EQ(out[i++], -powf(2.0f, -6.0f)); + EXPECT_EQ(out[i++], powf(2.0f, -9.0f)); + + // fp32x2 -> fp8x2 + // RNE + EXPECT_EQ(out[i++], -4.0f); + EXPECT_EQ(out[i++], 2.0f); + // SR + EXPECT_EQ(out[i++], -4.0f); + EXPECT_EQ(out[i++], 2.0f); + + EXPECT_EQ(test_size, completed); + EXPECT_EQ(test_size, i); +} + +__global__ void device_test_fp32_fp8_type_convert(uint64_t N, float* p_test, uint64_t* p_completed) +{ + test_fp32_fp8_type_convert(N, p_test, p_completed); +} + +TEST(FP8OCP, DeviceFP32FP8Convert) +{ + std::vector out(test_size, -1.0f); + + DeviceMem device_out(test_size * sizeof(float)); + DeviceMem device_completed(sizeof(uint64_t)); + + device_out.SetValue(-21.0f); + device_completed.SetValue(-21.0f); + + device_test_fp32_fp8_type_convert<<<1, 1>>>( + test_size, + static_cast(device_out.GetDeviceBuffer()), + 
static_cast(device_completed.GetDeviceBuffer())); + + uint64_t completed = 0; + device_completed.FromDevice(&completed); + device_out.FromDevice(out.data()); + + std::set fp8_nan_ids; + fp8_nan_ids.insert(0b11111111); //-NaN + fp8_nan_ids.insert(0b01111111); // +NaN + for(auto fp8_nan_id : fp8_nan_ids) + { + auto idx = fp8_nan_id; + ASSERT_TRUE(std::isnan(out[idx])) << "idx: " << idx << " out[idx]: " << out[idx]; + } + + for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++) + { + if(fp8_nan_ids.find(fp8_id) != fp8_nan_ids.end()) + continue; + + uint8_t fp8_uid = static_cast(fp8_id); + auto idx = fp8_uid; + ASSERT_FLOAT_EQ(out[idx], type_convert(f8_ocp_t{fp8_uid})) + << " fp8_id: " << fp8_id << std::endl + << type_convert(f8_ocp_t{fp8_uid}); + } + + /// Test vector conversions + + auto i = 256; + + // fp8x2 -> fp32x2 + EXPECT_EQ(out[i++], -powf(2.0f, -6.0f)); + EXPECT_EQ(out[i++], powf(2.0f, -9.0f)); + + // fp32x2 -> fp8x2 + // RNE + EXPECT_EQ(out[i++], -4.0f); + EXPECT_EQ(out[i++], 2.0f); + // SR + EXPECT_EQ(out[i++], -4.0f); + EXPECT_EQ(out[i++], 2.0f); + + EXPECT_EQ(test_size, completed); + EXPECT_EQ(test_size, i); +} + +__host__ __device__ void +test_fp16_fp8_type_convert(uint64_t N, half_t* p_test, uint64_t* p_completed) +{ + if(p_completed == nullptr) + { + return; + } + + uint64_t& i = *p_completed; + i = 0; + + if(p_test == nullptr) + { + return; + } + + for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++) + { + uint8_t fp8_uid = static_cast(fp8_id); + auto v = type_convert(f8_ocp_t{fp8_uid}); + p_test[i] = v; + i++; + if(i >= N) + { + return; + } + } + + /// Test vector conversion + // fp8x2 -> fp16x2 + f8x2_ocp_t fp8x2{f8x2_ocp_t::data_v{0b10001000, 0b00000001}}; //-2^-6, 2^-9 + + half2_t f16x2 = type_convert(fp8x2); + p_test[i++] = f16x2[0]; + if(i >= N) + { + return; + } + p_test[i++] = f16x2[1]; + if(i >= N) + { + return; + } + + // fp16x2 -> fp8x2 + f16x2 = {-4.0f, 2.0f}; + fp8x2 = f8_convert_rne(f16x2); // expect {-4, 2} + + p_test[i++] = 
type_convert(fp8x2.AsType()(ck::Number<0>{})); //-4f + if(i >= N) + { + return; + } + p_test[i++] = type_convert(fp8x2.AsType()(ck::Number<1>{})); // 2f + if(i >= N) + { + return; + } + + fp8x2 = f8_convert_sr(f16x2); // expect {-4, 2} + + p_test[i++] = type_convert(fp8x2.AsType()(ck::Number<0>{})); //-4f + if(i >= N) + { + return; + } + p_test[i++] = type_convert(fp8x2.AsType()(ck::Number<1>{})); // 2f + if(i >= N) + { + return; + } +} + +TEST(FP8OCP, HostFP16FP8Convert) +{ + std::vector out(test_size, -1.0f); + uint64_t completed = 0; + + test_fp16_fp8_type_convert(test_size, out.data(), &completed); + + std::set fp8_nan_ids; + fp8_nan_ids.insert(0b11111111); //-NaN + fp8_nan_ids.insert(0b01111111); // +NaN + for(auto fp8_nan_id : fp8_nan_ids) + { + auto idx = fp8_nan_id; + ASSERT_TRUE(std::isnan(type_convert(out[idx]))); + } + + for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++) + { + if(fp8_nan_ids.find(fp8_id) != fp8_nan_ids.end()) + continue; + + uint8_t fp8_uid = static_cast(fp8_id); + auto idx = fp8_uid; + ASSERT_FLOAT_EQ(out[idx], type_convert(f8_ocp_t{fp8_uid})) + << " fp8_id: " << fp8_id << std::endl + << type_convert(type_convert(f8_ocp_t{fp8_uid})); + } + + // /// Test vector conversions + + auto i = 256; + + // fp8x2 -> fp16x2 + EXPECT_EQ(out[i++], type_convert(-powf(2.0f, -6.0f))); + EXPECT_EQ(out[i++], type_convert(powf(2.0f, -9.0f))); + + // fp16x2 -> fp8x2 + // RNE + EXPECT_EQ(out[i++], type_convert(-4.0f)); + EXPECT_EQ(out[i++], type_convert(2.0f)); + // SR + EXPECT_EQ(out[i++], type_convert(-4.0f)); + EXPECT_EQ(out[i++], type_convert(2.0f)); + + EXPECT_EQ(test_size, completed); + EXPECT_EQ(test_size, i); +} + +__global__ void device_test_fp16_fp8_type_convert(uint64_t N, half_t* p_test, uint64_t* p_completed) +{ + test_fp16_fp8_type_convert(N, p_test, p_completed); +} + +TEST(FP8OCP, DeviceFP16FP8Convert) +{ + std::vector out(test_size, -1.0f); + + DeviceMem device_out(test_size * sizeof(half_t)); + DeviceMem device_completed(sizeof(uint64_t)); 
+ + device_out.SetValue(-21.0f); + device_completed.SetValue(-21.0f); + + device_test_fp16_fp8_type_convert<<<1, 1>>>( + test_size, + static_cast(device_out.GetDeviceBuffer()), + static_cast(device_completed.GetDeviceBuffer())); + + uint64_t completed = 0; + device_completed.FromDevice(&completed); + device_out.FromDevice(out.data()); + + std::set fp8_nan_ids; + fp8_nan_ids.insert(0b11111111); //-NaN + fp8_nan_ids.insert(0b01111111); // +NaN + for(auto fp8_nan_id : fp8_nan_ids) + { + auto idx = fp8_nan_id; + ASSERT_TRUE(std::isnan(type_convert(out[idx]))) + << "idx: " << idx << " out[idx]: " << type_convert(out[idx]); + } + + for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++) + { + if(fp8_nan_ids.find(fp8_id) != fp8_nan_ids.end()) + continue; + + uint8_t fp8_uid = static_cast(fp8_id); + auto idx = fp8_uid; + ASSERT_FLOAT_EQ(out[idx], type_convert(f8_ocp_t{fp8_uid})) + << " fp8_id: " << fp8_id << std::endl + << type_convert(type_convert(f8_ocp_t{fp8_uid})); + } + + /// Test vector conversions + + auto i = 256; + + // fp8x2 -> fp16x2 + EXPECT_EQ(out[i++], type_convert(-powf(2.0f, -6.0f))); + EXPECT_EQ(out[i++], type_convert(powf(2.0f, -9.0f))); + + // fp16x2 -> fp8x2 + // RNE + EXPECT_EQ(out[i++], type_convert(-4.0f)); + EXPECT_EQ(out[i++], type_convert(2.0f)); + // SR + EXPECT_EQ(out[i++], type_convert(-4.0f)); + EXPECT_EQ(out[i++], type_convert(2.0f)); + + EXPECT_EQ(test_size, completed); + EXPECT_EQ(test_size, i); +} + +__host__ __device__ void +test_bf16_fp8_type_convert(uint64_t N, bhalf_t* p_test, uint64_t* p_completed) +{ + if(p_completed == nullptr) + { + return; + } + + uint64_t& i = *p_completed; + i = 0; + + if(p_test == nullptr) + { + return; + } + + for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++) + { + uint8_t fp8_uid = static_cast(fp8_id); + auto v = type_convert(f8_ocp_t{fp8_uid}); + p_test[i] = v; + i++; + if(i >= N) + { + return; + } + } + + /// Test vector conversion + // fp8x2 -> bf16x2 + f8x2_ocp_t fp8x2{f8x2_ocp_t::data_v{0b10001000, 
0b00000001}}; //-2^-6, 2^-9 + + bhalf2_t bf16x2 = type_convert(fp8x2); + p_test[i++] = bf16x2[0]; + if(i >= N) + { + return; + } + p_test[i++] = bf16x2[1]; + if(i >= N) + { + return; + } + + // bf16x2 -> fp8x2 + bf16x2 = {type_convert(-4.0f), type_convert(2.0f)}; + fp8x2 = f8_convert_rne(bf16x2); // expect {-4, 2} + + p_test[i++] = type_convert(fp8x2.AsType()(ck::Number<0>{})); //-4f + if(i >= N) + { + return; + } + p_test[i++] = type_convert(fp8x2.AsType()(ck::Number<1>{})); // 2f + if(i >= N) + { + return; + } + + fp8x2 = f8_convert_sr(bf16x2); // expect {-4, 2} + + p_test[i++] = type_convert(fp8x2.AsType()(ck::Number<0>{})); //-4f + if(i >= N) + { + return; + } + p_test[i++] = type_convert(fp8x2.AsType()(ck::Number<1>{})); // 2f + if(i >= N) + { + return; + } +} + +TEST(FP8OCP, HostBF16FP8Convert) +{ + std::vector out(test_size, -1.0f); + uint64_t completed = 0; + + test_bf16_fp8_type_convert(test_size, out.data(), &completed); + + std::set fp8_nan_ids; + fp8_nan_ids.insert(0b11111111); //-NaN + fp8_nan_ids.insert(0b01111111); // +NaN + for(auto fp8_nan_id : fp8_nan_ids) + { + auto idx = fp8_nan_id; + ASSERT_TRUE(std::isnan(type_convert(out[idx]))); + } + + for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++) + { + if(fp8_nan_ids.find(fp8_id) != fp8_nan_ids.end()) + continue; + + uint8_t fp8_uid = static_cast(fp8_id); + auto idx = fp8_uid; + ASSERT_FLOAT_EQ(out[idx], type_convert(f8_ocp_t{fp8_uid})) + << " fp8_id: " << fp8_id << std::endl + << type_convert(type_convert(f8_ocp_t{fp8_uid})); + } + + // /// Test vector conversions + + auto i = 256; + + // fp8x2 -> bf16x2 + EXPECT_EQ(out[i++], type_convert(-powf(2.0f, -6.0f))); + EXPECT_EQ(out[i++], type_convert(powf(2.0f, -9.0f))); + + // bf16x2 -> fp8x2 + // RNE + EXPECT_EQ(out[i++], type_convert(-4.0f)); + EXPECT_EQ(out[i++], type_convert(2.0f)); + // SR + EXPECT_EQ(out[i++], type_convert(-4.0f)); + EXPECT_EQ(out[i++], type_convert(2.0f)); + + EXPECT_EQ(test_size, completed); + EXPECT_EQ(test_size, i); +} + 
+__global__ void +device_test_bf16_fp8_type_convert(uint64_t N, bhalf_t* p_test, uint64_t* p_completed) +{ + test_bf16_fp8_type_convert(N, p_test, p_completed); +} + +TEST(FP8OCP, DeviceBF16FP8Convert) +{ + std::vector out(test_size, -1.0f); + + DeviceMem device_out(test_size * sizeof(bhalf_t)); + DeviceMem device_completed(sizeof(uint64_t)); + + device_out.SetValue(-21.0f); + device_completed.SetValue(-21.0f); + + device_test_bf16_fp8_type_convert<<<1, 1>>>( + test_size, + static_cast(device_out.GetDeviceBuffer()), + static_cast(device_completed.GetDeviceBuffer())); + + uint64_t completed = 0; + device_completed.FromDevice(&completed); + device_out.FromDevice(out.data()); + + std::set fp8_nan_ids; + fp8_nan_ids.insert(0b11111111); //-NaN + fp8_nan_ids.insert(0b01111111); // +NaN + for(auto fp8_nan_id : fp8_nan_ids) + { + auto idx = fp8_nan_id; + ASSERT_TRUE(std::isnan(type_convert(out[idx]))) + << "idx: " << idx << " out[idx]: " << type_convert(out[idx]); + } + + for(ck::index_t fp8_id = 0; fp8_id < 256; fp8_id++) + { + if(fp8_nan_ids.find(fp8_id) != fp8_nan_ids.end()) + continue; + + uint8_t fp8_uid = static_cast(fp8_id); + auto idx = fp8_uid; + ASSERT_FLOAT_EQ(out[idx], type_convert(f8_ocp_t{fp8_uid})) + << " fp8_id: " << fp8_id << std::endl + << type_convert(type_convert(f8_ocp_t{fp8_uid})); + } + + /// Test vector conversions + + auto i = 256; + + // fp8x2 -> bf16x2 + EXPECT_EQ(out[i++], type_convert(-powf(2.0f, -6.0f))); + EXPECT_EQ(out[i++], type_convert(powf(2.0f, -9.0f))); + + // bf16x2 -> fp8x2 + // RNE + EXPECT_EQ(out[i++], type_convert(-4.0f)); + EXPECT_EQ(out[i++], type_convert(2.0f)); + // SR + EXPECT_EQ(out[i++], type_convert(-4.0f)); + EXPECT_EQ(out[i++], type_convert(2.0f)); + + EXPECT_EQ(test_size, completed); + EXPECT_EQ(test_size, i); +} From 50d1f8ff905eeabc61123864d9a805d215676a53 Mon Sep 17 00:00:00 2001 From: Thomas Ning Date: Thu, 3 Apr 2025 11:48:54 -0700 Subject: [PATCH 018/443] Add the MI355 support for CK TILE GEMM (#2046) * Get the 
root cause of the ck tile gemm failing on mi355 * Fix the ck tile gemm on MI355 * delete the debug info --- example/ck_tile/03_gemm/CMakeLists.txt | 9 ++++++--- example/ck_tile/03_gemm/run_gemm_example.inc | 8 ++++---- test/ck_tile/gemm/CMakeLists.txt | 20 +++++++++++++++++++ .../gemm/test_gemm_pipeline_compv3.cpp | 2 +- .../gemm/test_gemm_pipeline_compv4.cpp | 2 +- 5 files changed, 32 insertions(+), 9 deletions(-) diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt index 30cfee22f6..61c3a57391 100644 --- a/example/ck_tile/03_gemm/CMakeLists.txt +++ b/example/ck_tile/03_gemm/CMakeLists.txt @@ -1,5 +1,8 @@ add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp) add_executable(tile_example_gemm_universal EXCLUDE_FROM_ALL universal_gemm.cpp) -target_compile_options(tile_example_gemm_universal PRIVATE - -mllvm -enable-noalias-to-md-conversion=0 -) +set(EXAMPLE_GEMM_COMPILE_OPTIONS) +if(CK_USE_OCP_FP8) + list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8) +endif() +list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0) +target_compile_options(tile_example_gemm_universal PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc index 6cb40e45d1..c3b4ec609c 100644 --- a/example/ck_tile/03_gemm/run_gemm_example.inc +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -240,8 +240,8 @@ int run_gemm_example_with_layouts(int argc, if(init_method == 0) { - ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k); - ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n); + ck_tile::FillUniformDistribution{-1.f, 1.f}(a_m_k); + ck_tile::FillUniformDistribution{-1.f, 1.f}(b_k_n); } else if(init_method == 1) { @@ -250,8 +250,8 @@ int run_gemm_example_with_layouts(int argc, } else if(init_method == 2) { - ck_tile::FillConstant{static_cast(1)}(a_m_k); - ck_tile::FillConstant{static_cast(1)}(b_k_n); + 
ck_tile::FillUniformDistribution{1.f, 1.f}(a_m_k); + ck_tile::FillUniformDistribution{1.f, 1.f}(b_k_n); } else { diff --git a/test/ck_tile/gemm/CMakeLists.txt b/test/ck_tile/gemm/CMakeLists.txt index 7701e451ad..3e7296b1eb 100644 --- a/test/ck_tile/gemm/CMakeLists.txt +++ b/test/ck_tile/gemm/CMakeLists.txt @@ -1,8 +1,28 @@ # Currently ck_tile is only built on gfx94/gfx95 +set(EXAMPLE_GEMM_COMPILE_OPTIONS "") +if(CK_USE_OCP_FP8) + list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8) +endif() +set(EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS "") +if(CK_USE_OCP_FP8) + list(APPEND EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS -DCK_TILE_USE_OCP_FP8) +endif() +list(APPEND EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS + -mllvm + -enable-noalias-to-md-conversion=0 +) + +if(CK_USE_OCP_FP8) + list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8) if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95") add_gtest_executable(test_ck_tile_gemm_pipeline_mem test_gemm_pipeline_mem.cpp) add_gtest_executable(test_ck_tile_gemm_pipeline_compv3 test_gemm_pipeline_compv3.cpp) add_gtest_executable(test_ck_tile_gemm_pipeline_compv4 test_gemm_pipeline_compv4.cpp) + + target_compile_options(test_ck_tile_gemm_pipeline_mem PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) + target_compile_options(test_ck_tile_gemm_pipeline_compv3 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) + target_compile_options(test_ck_tile_gemm_pipeline_compv4 PRIVATE ${EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS}) else() message("Skipping ck_tile_gemm tests for current target") endif() +endif() diff --git a/test/ck_tile/gemm/test_gemm_pipeline_compv3.cpp b/test/ck_tile/gemm/test_gemm_pipeline_compv3.cpp index d81e870ffc..8944e6865d 100644 --- a/test/ck_tile/gemm/test_gemm_pipeline_compv3.cpp +++ b/test/ck_tile/gemm/test_gemm_pipeline_compv3.cpp @@ -9,7 +9,7 @@ class TestCkTileGemmPipelineCompV3 : public TestCkTileGemmPipeline #define TEST_SUITE_NAME TestCkTileGemmPipelineCompV3 -TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV3, 
KernelTypesMem); +TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV3, KernelTypesCompV3); #include "test_gemm_pipeline_ut_cases.inc" diff --git a/test/ck_tile/gemm/test_gemm_pipeline_compv4.cpp b/test/ck_tile/gemm/test_gemm_pipeline_compv4.cpp index 1da0028f63..22e77fac41 100644 --- a/test/ck_tile/gemm/test_gemm_pipeline_compv4.cpp +++ b/test/ck_tile/gemm/test_gemm_pipeline_compv4.cpp @@ -9,7 +9,7 @@ class TestCkTileGemmPipelineCompV4 : public TestCkTileGemmPipeline #define TEST_SUITE_NAME TestCkTileGemmPipelineCompV4 -TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV4, KernelTypesMem); +TYPED_TEST_SUITE(TestCkTileGemmPipelineCompV4, KernelTypesCompV4); #include "test_gemm_pipeline_ut_cases.inc" From fed0709121365e4ce8208a1a0a988905d43a1963 Mon Sep 17 00:00:00 2001 From: Khushbu Agarwal Date: Thu, 3 Apr 2025 11:54:12 -0700 Subject: [PATCH 019/443] [New] Build up the feature of CK Tile GEMM CodeGen (#1994) * New branch for codegen changes * Fix verify function for int4 * pk_int4 codegen * Update to review comments * Remove codegen directory and rename filenames * Remove extra files; clean up CMake file * New branch for codegen changes * Fix verify function for int4 * pk_int4 codegen * Update to review comments * Remove codegen directory and rename filenames * Remove extra files; clean up CMake file * code changes for single instance * config file rename, added few more combinations in json file * Fix cmake file * Addressing review comments * Reverting files changed by merge to develop --------- Co-authored-by: ThomasNing --- CMakeLists.txt | 1 + tile_engine/CMakeLists.txt | 5 + tile_engine/ops/CMakeLists.txt | 1 + tile_engine/ops/gemm/CMakeLists.txt | 45 ++ .../gemm/configs/instance_combination.json | 60 ++ tile_engine/ops/gemm/gemm_host_api.cpp | 169 +++++ tile_engine/ops/gemm/gemm_host_api.hpp | 287 +++++++++ tile_engine/ops/gemm/gemm_instance_builder.py | 596 ++++++++++++++++++ 8 files changed, 1164 insertions(+) create mode 100755 tile_engine/CMakeLists.txt create mode 
100755 tile_engine/ops/CMakeLists.txt create mode 100644 tile_engine/ops/gemm/CMakeLists.txt create mode 100644 tile_engine/ops/gemm/configs/instance_combination.json create mode 100644 tile_engine/ops/gemm/gemm_host_api.cpp create mode 100644 tile_engine/ops/gemm/gemm_host_api.hpp create mode 100755 tile_engine/ops/gemm/gemm_instance_builder.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c1ca789f5..ba57ead09a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -610,6 +610,7 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS) PACKAGE_NAME examples ) add_subdirectory(example) + add_subdirectory(tile_engine) if(BUILD_TESTING) add_subdirectory(test) endif() diff --git a/tile_engine/CMakeLists.txt b/tile_engine/CMakeLists.txt new file mode 100755 index 0000000000..cd1a192a74 --- /dev/null +++ b/tile_engine/CMakeLists.txt @@ -0,0 +1,5 @@ +include_directories(BEFORE + ${CMAKE_CURRENT_LIST_DIR}/include + ) + +add_subdirectory(ops) diff --git a/tile_engine/ops/CMakeLists.txt b/tile_engine/ops/CMakeLists.txt new file mode 100755 index 0000000000..0cf2c16da2 --- /dev/null +++ b/tile_engine/ops/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(gemm) diff --git a/tile_engine/ops/gemm/CMakeLists.txt b/tile_engine/ops/gemm/CMakeLists.txt new file mode 100644 index 0000000000..d28017ca0c --- /dev/null +++ b/tile_engine/ops/gemm/CMakeLists.txt @@ -0,0 +1,45 @@ + + +# generate a list of kernels, but not actually emit files at config stage +execute_process( + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py + --working_path ${CMAKE_CURRENT_BINARY_DIR} + --json ${CMAKE_CURRENT_LIST_DIR}/configs/instance_combination.json + --list_blobs + RESULT_VARIABLE ret +) + +if(ret AND NOT ret EQUAL 0) + message( FATAL_ERROR "Fail to generate kernels via Python. 
${ret}") +endif() + +file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/gemm_instance_blobs.txt GEMM_CODEGEN_BLOBS) + +add_custom_command( + OUTPUT ${GEMM_CODEGEN_BLOBS} + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py + --working_path ${CMAKE_CURRENT_BINARY_DIR} + --json ${CMAKE_CURRENT_LIST_DIR}/configs/instance_combination.json + --gen_blobs + DEPENDS ${GEMM_CODEGEN_BLOBS} +) + +set(EXECUTABLE_GEMM_INSTANCE "tile_engine_gemm") +message("adding example ${EXECUTABLE_GEMM_INSTANCE}") + +# use build as include directory +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +add_executable(${EXECUTABLE_GEMM_INSTANCE} EXCLUDE_FROM_ALL gemm_host_api.cpp) +target_include_directories(${EXECUTABLE_GEMM_INSTANCE} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +target_sources(${EXECUTABLE_GEMM_INSTANCE} PRIVATE ${GEMM_CODEGEN_BLOBS}) + +set(EXECUTABLE_GEMM_INSTANCE_COMPILE_OPTIONS) + +list(APPEND EXECUTABLE_GEMM_INSTANCE_COMPILE_OPTIONS + -Wno-undefined-func-template + -Wno-float-equal + --offload-compress) + +target_compile_options(${EXECUTABLE_GEMM_INSTANCE} PRIVATE ${EXECUTABLE_GEMM_INSTANCE_COMPILE_OPTIONS}) + +set_property(GLOBAL PROPERTY RULE_MESSAGES OFF) \ No newline at end of file diff --git a/tile_engine/ops/gemm/configs/instance_combination.json b/tile_engine/ops/gemm/configs/instance_combination.json new file mode 100644 index 0000000000..e21197d1de --- /dev/null +++ b/tile_engine/ops/gemm/configs/instance_combination.json @@ -0,0 +1,60 @@ +{ + + "layout_a": { + "values": ["r"] + }, + "layout_b": { + "values": ["c"] + }, + "layout_c": { + "values": ["r"] + }, + "datatype": { + "values": ["fp16"] + }, + "tile_m": { + "values": [256] + }, + "tile_n": { + "values": [256] + }, + "tile_k": { + "values": [64] + }, + "warp_m": { + "values": [2] + }, + "warp_n": { + "values": [2] + }, + "warp_k": { + "values": [1] + }, + "warp_tile_m": { + "values": [32] + }, + "warp_tile_n": { + "values": [32] + }, + "warp_tile_k": { + "values": [16] + }, + "kPadM": { + 
"values": [false] + }, + "kPadN": { + "values": [false] + }, + "kPadK": { + "values": [false] + }, + "pipeline": { + "values": ["compv3", "mem"] + }, + "scheduler": { + "values": ["intrawave", "interwave"] + }, + "epilogue": { + "values": ["default", "cshuffle"] + } +} diff --git a/tile_engine/ops/gemm/gemm_host_api.cpp b/tile_engine/ops/gemm/gemm_host_api.cpp new file mode 100644 index 0000000000..508f634920 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_host_api.cpp @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile/host.hpp" +#include "gemm_common.hpp" +#include "gemm_dispatcher.hpp" +#include "gemm_host_api.hpp" + +float gemm_kernel_launch(KernelTraits& trait, + ck_tile::GemmHostArgs& args, + const ck_tile::stream_config& s) +{ + return GemmDispatcher::dispatch(trait, args, s); +} + +template +bool run(const ck_tile::ArgParser& arg_parser) +{ + const ALayout a_layout = ALayout{}; + const BLayout b_layout = BLayout{}; + // const CLayout c_layout = CLayout{}; + + ck_tile::index_t kbatch = arg_parser.get_int("split_k"); + ck_tile::index_t M = arg_parser.get_int("m"); + ck_tile::index_t N = arg_parser.get_int("n"); + ck_tile::index_t K = arg_parser.get_int("k"); + + ck_tile::index_t stride_A = arg_parser.get_int("stride_a"); + ck_tile::index_t stride_B = arg_parser.get_int("stride_b"); + ck_tile::index_t stride_C = arg_parser.get_int("stride_c"); + + int n_warmup = arg_parser.get_int("warmup"); + int n_repeat = arg_parser.get_int("repeat"); + int verify = arg_parser.get_int("v"); + ck_tile::index_t init_method = arg_parser.get_int("init"); + + stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout)); + stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout)); + stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{})); + + ck_tile::HostTensor a_m_k( + ck_tile::host_tensor_descriptor(M, K, stride_A, 
is_row_major(a_layout))); + ck_tile::HostTensor b_k_n( + ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout))); + ck_tile::HostTensor c_m_n_dev_result( + ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{}))); + + if(init_method == 0) + { + ck_tile::FillUniformDistribution{-1.f, 1.f}(a_m_k); + ck_tile::FillUniformDistribution{-1.f, 1.f}(b_k_n); + } + else if(init_method == 1) + { + ck_tile::FillMonotonicSeq{}(a_m_k); + ck_tile::FillMonotonicSeq{}(b_k_n); + } + else if(init_method == 2) + { + ck_tile::FillConstant{static_cast(1)}(a_m_k); + ck_tile::FillConstant{static_cast(1)}(b_k_n); + } + else + { + a_m_k.SetZero(); + b_k_n.SetZero(); + } + + ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); + ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); + ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); + + if constexpr(std::is_same_v) + { + // Permute vector pk_i4x4 data for device implementation + ck_tile::HostTensor b_k_n_dev = b_k_n; + // permute_tensor_b(b_k_n_dev); + permute_vectors_i4x4_b(b_k_n_dev); + b_k_n_dev_buf.ToDevice(b_k_n_dev.data()); + } + else + { + b_k_n_dev_buf.ToDevice(b_k_n.data()); + } + + a_m_k_dev_buf.ToDevice(a_m_k.data()); + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + + ck_tile::GemmHostArgs gemm_args; + gemm_args.a_ptr = a_m_k_dev_buf.GetDeviceBuffer(); + gemm_args.b_ptr = b_k_n_dev_buf.GetDeviceBuffer(); + gemm_args.c_ptr = c_m_n_dev_buf.GetDeviceBuffer(); + gemm_args.k_batch = kbatch; + gemm_args.M = M; + gemm_args.N = N; + gemm_args.K = K; + gemm_args.stride_A = stride_A; + gemm_args.stride_B = stride_B; + gemm_args.stride_C = stride_C; + + KernelTraits trait; + trait.pipeline = arg_parser.get_str("pipeline"); + trait.scheduler = arg_parser.get_str("scheduler"); + trait.epilogue = arg_parser.get_str("epilogue"); + trait.kPadM = arg_parser.get_bool("pad_m"); + trait.kPadN = arg_parser.get_bool("pad_n"); + 
trait.kPadK = arg_parser.get_bool("pad_k"); + + float ave_time = gemm_kernel_launch( + trait, gemm_args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_byte = + sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N; + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Run Gemm kernel with M =" << M << " N =" << N << " K =" << K + << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C + << " A_Layout =" << ALayout::name << " B_Layout =" << BLayout::name + << " C_Layout =" << CLayout::name << " A Type = " << DataTypeTraits::name + << " B Type = " << DataTypeTraits::name + << " C Type = " << DataTypeTraits::name << " : " << ave_time << " ms, " + << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl; + + c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); + bool pass = true; + if(verify) + { + pass = gemm_verify( + verify, + a_m_k, + b_k_n, + c_m_n_dev_result, + a_m_k_dev_buf, + b_k_n_dev_buf, + M, + N, + K, + stride_A, + stride_B, + stride_C, + kbatch); + } + return pass; +} + +int main(int argc, char* argv[]) +{ + try + { + auto [result, parser] = create_args(argc, argv); + if(!result) + return EXIT_FAILURE; + return run(parser); + } + catch(const std::exception& e) + { + std::cerr << "Error: " << e.what() << "\n"; + return EXIT_FAILURE; + } +} diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp new file mode 100644 index 0000000000..4f0ea52a18 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_host_api.hpp @@ -0,0 +1,287 @@ +#include + +#include +#include +#include +#include +#include +#include "ck_tile/ops/gemm.hpp" + +#pragma once + +template +struct DataTypeTraits; + +template <> +struct DataTypeTraits +{ + static constexpr const char* name = "fp32"; +}; + +template <> +struct DataTypeTraits +{ + static 
constexpr const char* name = "fp64"; +}; + +template <> +struct DataTypeTraits +{ + static constexpr const char* name = "fp16"; +}; + +template <> +struct DataTypeTraits +{ + static constexpr const char* name = "bf16"; +}; + +template <> +struct DataTypeTraits +{ + static constexpr const char* name = "fp8"; +}; + +template <> +struct DataTypeTraits +{ + static constexpr const char* name = "bf8"; +}; + +template <> +struct DataTypeTraits +{ + static constexpr const char* name = "pk_int4_t"; +}; + +struct KernelTraits +{ + std::string pipeline; + std::string scheduler; + std::string epilogue; + bool kPadM; + bool kPadN; + bool kPadK; +}; + +template +static constexpr inline auto is_row_major(Layout layout_) +{ + return ck_tile::bool_constant, + ck_tile::tensor_layout::gemm::RowMajor>>{}; +} + +template +auto calculate_rtol_atol(const ck_tile::index_t K, + const ck_tile::index_t kbatch, + const float max_accumulated_value) +{ + using ComputeType = + std::conditional_t; + // Calculate thresholds + const auto rtol = ck_tile::get_relative_threshold( + ck_tile::integer_divide_ceil(K, kbatch)); + const auto atol = ck_tile::get_absolute_threshold( + max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); + // Calculate error due to split_k accumulation + const auto rtol_split_k = + ck_tile::get_relative_threshold(kbatch); + const auto atol_split_k = ck_tile::get_absolute_threshold( + max_accumulated_value, kbatch); + // Use higher threshold + return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); +} + +inline auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("m", "3840", "m dimension") + .insert("n", "4096", "n dimension") + .insert("k", "2048", "k dimension") + .insert("stride_a", "0", "Tensor A stride") + .insert("stride_b", "0", "Tensor B stride") + .insert("stride_c", "0", "Tensor C stride") + .insert("split_k", "1", "splitK value") + .insert("v", "2", "0. No validation, 1. 
Validation on CPU, 2. Validation on GPU") + .insert("warmup", "50", "number of iterations before benchmark the kernel") + .insert("repeat", "100", "number of iterations to benchmark the kernel") + .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer") + .insert("init", "0", "0:random, 1:linear, 2:constant(1)") + .insert("pipeline", "compv3", "compv3, compv4, mem") + .insert("scheduler", "intrawave", "intrawave, interwave") + .insert("epilogue", "cshuffle", "cshuffle, default") + .insert("pad_m", "false", "true, false") + .insert("pad_n", "false", "true, false") + .insert("pad_k", "false", "true, false"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +template +void permute_vectors_i4x4_b(Tensor& tensor) +{ + const ck_tile::index_t K = tensor.get_length(0); + const ck_tile::index_t N = tensor.get_length(1); + // vector pk_i4x4 permute + for(int i = 0; i < N; i++) + { + for(int j = 0; j < K; j += 8) + { + int8_t input[8]; + + for(int k = 0; k < 4; k++) + { + int8_t i4x2 = tensor(j + k * 2, i).data; + input[k * 2 + 0] = (i4x2 >> 4) & 0xf; + input[k * 2 + 1] = (i4x2 >> 0) & 0xf; + } + + // permute 01234567->20643175 + { + int8_t hi = input[2]; + int8_t lo = input[0]; + int8_t i4x2 = (hi << 4) | lo; + + tensor(j + 0, i) = i4x2; + } + + { + int8_t hi = input[6]; + int8_t lo = input[4]; + int8_t i4x2 = (hi << 4) | lo; + + tensor(j + 2, i) = i4x2; + } + + { + int8_t hi = input[3]; + int8_t lo = input[1]; + int8_t i4x2 = (hi << 4) | lo; + + tensor(j + 4, i) = i4x2; + } + + { + int8_t hi = input[7]; + int8_t lo = input[5]; + int8_t i4x2 = (hi << 4) | lo; + + tensor(j + 6, i) = i4x2; + } + } + } +} + +// verification code +template +bool gemm_verify(int verify, + ck_tile::HostTensor& a_m_k, + ck_tile::HostTensor& b_k_n, + ck_tile::HostTensor& c_m_n_dev_result, + ck_tile::DeviceMem& a_m_k_dev_buf, + ck_tile::DeviceMem& b_k_n_dev_buf, + ck_tile::index_t M, + ck_tile::index_t N, + ck_tile::index_t K, + ck_tile::index_t 
stride_A, + ck_tile::index_t stride_B, + ck_tile::index_t stride_C, + ck_tile::index_t kbatch) +{ + bool pass = true; + if(verify == 1) + { + ck_tile::HostTensor c_m_n_host_ref( + ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{}))); + c_m_n_host_ref.SetZero(); + + ck_tile::reference_gemm( + a_m_k, b_k_n, c_m_n_host_ref); + const float max_accumulated_value = + *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end()); + const auto rtol_atol = calculate_rtol_atol( + K, kbatch, max_accumulated_value); + pass = ck_tile::check_err(c_m_n_dev_result, + c_m_n_host_ref, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) + << std::endl; + std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl; + } + else if(verify == 2) + { + if constexpr(std::is_same_v) + { + // Restore input for B for gpu reference + b_k_n_dev_buf.ToDevice(b_k_n.data()); + } + ck_tile::HostTensor c_m_n_gpu_ref( + ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{}))); + ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes()); + c_m_n_gpu_ref.SetZero(); + c_m_n_gpu_buf_ref.SetZero(); + + ADataType* d_A; + BDataType* d_B; + CDataType* d_C; + + ck_tile::hip_check_error(hipMalloc(&d_A, a_m_k.get_element_space_size_in_bytes())); + ck_tile::hip_check_error(hipMalloc(&d_B, b_k_n.get_element_space_size_in_bytes())); + ck_tile::hip_check_error( + hipMalloc(&d_C, c_m_n_dev_result.get_element_space_size_in_bytes())); + + ck_tile::hip_check_error(hipMemcpy(d_A, + a_m_k_dev_buf.GetDeviceBuffer(), + a_m_k.get_element_space_size_in_bytes(), + hipMemcpyHostToDevice)); + ck_tile::hip_check_error(hipMemcpy(d_B, + b_k_n_dev_buf.GetDeviceBuffer(), + b_k_n.get_element_space_size_in_bytes(), + 
hipMemcpyHostToDevice)); + + ck_tile::reference_gemm_gpu(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C); + + ck_tile::hip_check_error(hipMemcpy(c_m_n_gpu_buf_ref.GetDeviceBuffer(), + d_C, + c_m_n_dev_result.get_element_space_size_in_bytes(), + hipMemcpyDeviceToHost)); + + ck_tile::hip_check_error(hipFree(d_A)); + ck_tile::hip_check_error(hipFree(d_B)); + ck_tile::hip_check_error(hipFree(d_C)); + + c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data()); + const float max_accumulated_value = + *std::max_element(c_m_n_gpu_ref.mData.begin(), c_m_n_gpu_ref.mData.end()); + const auto rtol_atol = calculate_rtol_atol( + K, kbatch, max_accumulated_value); + pass = ck_tile::check_err(c_m_n_dev_result, + c_m_n_gpu_ref, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) + << std::endl; + std::cout << "The GPU verification result is: " << (pass ? "correct" : "fail") << std::endl; + } + return pass; +} diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py new file mode 100755 index 0000000000..c0dad03ef0 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -0,0 +1,596 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+# generate kernel instances to speed up compilation + +import argparse +from enum import IntEnum +from pathlib import Path +import sys +from typing import List, Optional, Dict, Any +import functools +import itertools +import copy +import json +from dataclasses import dataclass + +DATA_TYPE_MAP = {'fp32' : 'float', + 'fp16' : 'ck_tile::half_t', + 'bf16' : 'ck_tile::bf16_t', + 'int8' : 'ck_tile::int8_t', + 'fp8' : 'ck_tile::fp8_t', + 'bf8' : 'ck_tile::bf8_t', + 'int4' : 'ck_tile::pk_int4_t' + } + +LAYOUT_MAP = {'r' : 'ck_tile::tensor_layout::gemm::RowMajor', + 'c' : 'ck_tile::tensor_layout::gemm::ColumnMajor'} + +DEFAULT_EPILOGUE = """ + using GemmEpilogue = ck_tile::DefaultGemm2DEpilogue< + ck_tile::DefaultGemm2DEpilogueProblem>; +""" + +CSHUFFLE_EPILOGUE = """ + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; +""" +HOT_LOOP_FALSE = """ + if(tail_num == ck_tile::TailNumber::Full) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else if(tail_num == ck_tile::TailNumber::Odd) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else if(tail_num == ck_tile::TailNumber::Even) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else + { + throw std::runtime_error("Num K loop must be larger than number of prefetech stages."); + } +""" +RUN_MEM = """ + if(tail_num == ck_tile::TailNumber::One) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else if(tail_num == ck_tile::TailNumber::Full) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + + if constexpr(BaseGemmPipeline::PrefetchStages > 2) + { + if(tail_num == ck_tile::TailNumber::Two) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + + if(tail_num == ck_tile::TailNumber::Three) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + if(tail_num == ck_tile::TailNumber::Four) + { + Run(ck_tile::bool_constant{}, + 
ck_tile::integral_constant{}); + } + if(tail_num == ck_tile::TailNumber::Five) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + if(tail_num == ck_tile::TailNumber::Six) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + if(tail_num == ck_tile::TailNumber::Seven) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + throw std::runtime_error("The tile number is wrong! It should not exceed the prefetch stage numbers"); + } +""" + +RUN_COMPV3 = """ + if(tail_num == ck_tile::TailNumber::Full) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else if(tail_num == ck_tile::TailNumber::Odd) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else if(tail_num == ck_tile::TailNumber::Even) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else + { + throw std::runtime_error("The tail number is wrong. It should be Full, Odd, or Even."); + } +""" + +RUN_COMPV4 = """ + if(tail_num == ck_tile::TailNumber::Three) + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else + { + Run(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } +""" + + +PIPELINE_MAP = {'mem' : ['ck_tile::BaseGemmPipelineAgBgCrMem', 'ck_tile::GemmPipelineAgBgCrMem'], + 'compv3' : ['ck_tile::BaseGemmPipelineAgBgCrCompV3', 'ck_tile::GemmPipelineAgBgCrCompV3'], + 'compv4' : ['ck_tile::BaseGemmPipelineAgBgCrCompV4', 'ck_tile::GemmPipelineAgBgCrCompV4']} + +SCHEDULER_MAP = {'interwave' : 'ck_tile::GemmPipelineScheduler::Interwave', + 'intrawave' : 'ck_tile::GemmPipelineScheduler::Intrawave'} + +EPILOGUE_MAP = {'default' :DEFAULT_EPILOGUE, + 'cshuffle' : CSHUFFLE_EPILOGUE} + +HOT_LOOP_TRUE = {'mem' : RUN_MEM, + 'compv3' : RUN_COMPV3, + 'compv4' : RUN_COMPV4} + + +def BOOL_MAP(b_) -> str: + if b_: + return 'true' + else: + return 'false' + +@dataclass +class GemmConfig: + def __init__(self, config_data): + self.matrix_cfg : Dict[str, Any] = {} 
+ self.impl_cfg : Dict[str, Any] = {} + for key, value in config_data.items(): + if key in ["datatype", "layout_a", "layout_b", "layout_c"]: + self.matrix_cfg[key] = value + else: + self.impl_cfg[key] = value + + @property + def datatype(self) -> str: + return self.matrix_cfg["datatype"]["values"][0] + + @property + def layouts(self) -> List[str]: + return [ + self.matrix_cfg["layout_a"]["values"][0], + self.matrix_cfg["layout_b"]["values"][0], + self.matrix_cfg["layout_c"]["values"][0] + ] + + +class GemmCodeGenerator: + def __init__(self, output_dir: str, config: GemmConfig): + self.output_dir = Path(output_dir) + if not self.output_dir.exists(): + self.output_dir.mkdir() + + self.config = config + self.all_kernels = [] + self.unique_configs = [] + # Validate configurations + self._validate_config() + + def _validate_config(self): + """Validate matrix and implementation configurations""" + # Matrix config validation + for param in ["datatype", "layout_a", "layout_b", "layout_c"]: + if len(self.config.matrix_cfg[param]["values"]) != 1: + raise ValueError(f"Matrix config {param} must have exactly one value") + + # Implementation traits validation + required_params = ["tile_m", "tile_n", "tile_k", "warp_m", "warp_n", "warp_k", + "warp_tile_m", "warp_tile_n", "warp_tile_k", "pipeline", + "epilogue", "scheduler", "kPadM", "kPadN", "kPadK"] + for param in required_params: + if not self.config.impl_cfg.get(param, {}).get("values"): + raise ValueError(f"Missing implementation parameter: {param}") + + def list_all(self): + """List all possible kernel configurations""" + w_p = Path(self.output_dir) + list_p = w_p / 'gemm_instance_blobs.txt' + self._list_config_groups() + with list_p.open('w') as list_f: + list_f.write(str(w_p / ("gemm_common.hpp")) + "\n") + list_f.write(str(w_p / ("gemm_instances.hpp")) + "\n") + list_f.write(str(w_p / ("gemm_dispatcher.hpp")) + "\n") + for group in self.all_kernels: + list_f.write(str(w_p / ("gemm_" + group + ".hpp")) + "\n") + + + + def 
_list_config_groups(self): + params = [ + ("pipeline", "pipeline"), + ("epilogue", "epilogue"), + ("scheduler", "scheduler"), + ("kPadM", "kPadM"), + ("kPadN", "kPadN"), + ("kPadK", "kPadK") + ] + + # Generate all unique_combinations + _unique = set(itertools.product(*[self.config.impl_cfg[p]["values"] for (p, _) in params])) + for combo in _unique: + config = {name: value for (_, name), value in zip(params, combo)} + pipeline, epilogue, scheduler, kPadM, kPadN, kPadK = config.values() + # To remove some unsupported combinations + unsupported_combination = [("compv3", "cshuffle", "interwave"), + ("compv3", "default", "interwave"), + ("compv4", "cshuffle", "interwave"), + ("compv4", "default", "interwave")] + if (pipeline, epilogue, scheduler) not in unsupported_combination: + group_name = f"{pipeline}_{epilogue}_{scheduler}_pad_{BOOL_MAP(kPadM)}_{BOOL_MAP(kPadN)}_{BOOL_MAP(kPadK)}" + self.all_kernels.append(group_name) + self.unique_configs.append(config) + + def generate_all(self): + self._generate_common_header() + self._generate_config_groups() + self._generate_dispatcher() + + + def _generate_common_header(self): + """Generate common header with datatypes and layout""" + ctype = self.config.datatype + atype = self.config.datatype + btype = self.config.datatype + if self.config.datatype in ['fp8', 'bf8']: + ctype = 'fp16' + elif self.config.datatype in ['int4']: + atype = 'fp16' + ctype = 'fp16' + + content = f"""// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once +#include "ck_tile/core.hpp" + +// Data types +using ADataType = {DATA_TYPE_MAP[atype]}; +using BDataType = {DATA_TYPE_MAP[btype]}; +using AccDataType = float; +using CDataType = {DATA_TYPE_MAP[ctype]}; + +// Layout configurations +using ALayout = {LAYOUT_MAP[self.config.layouts[0]]}; +using BLayout = {LAYOUT_MAP[self.config.layouts[1]]}; +using CLayout = {LAYOUT_MAP[self.config.layouts[2]]}; +""" + + + (self.output_dir / "gemm_common.hpp").write_text(content) + + def _generate_config_groups(self): + """Generate implementation configuration groups""" + if not self.unique_configs: # Check if the list is empty + self._list_config_groups() + for config in self.unique_configs: + self._generate_config_group(**config) + self.generate_common_instances_header() + + + def _generate_config_group(self, pipeline: str, epilogue: str, scheduler: str, + kPadM: bool, kPadN: bool, kPadK: bool): + """Generate a configuration group with all tile/warp combinations""" + group_name = f"{pipeline}_{epilogue}_{scheduler}_pad_{BOOL_MAP(kPadM)}_{BOOL_MAP(kPadN)}_{BOOL_MAP(kPadK)}" + filename = f"gemm_{group_name}.hpp" + + content = f"""// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "gemm_common.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/host.hpp" + +namespace {group_name} {{ +""" + # Add template struct with configuration + content += self._generate_kernel_struct(pipeline, epilogue, scheduler, kPadM, kPadN, kPadK) + + content += f"\n}} // namespace {group_name}\n" + (self.output_dir / filename).write_text(content) + + def _generate_kernel_struct(self, pipeline: str, epilogue: str, scheduler: str, + kPadM: bool, kPadN: bool, kPadK: bool) -> str: + """Generate kernel struct template""" + return f""" +template +struct GemmKernel {{ + static constexpr bool kPadM = {BOOL_MAP(kPadM)}; + static constexpr bool kPadN = {BOOL_MAP(kPadN)}; + static constexpr bool kPadK = {BOOL_MAP(kPadK)}; + + static float launch(ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) {{ + static constexpr bool permuteA = false; + static constexpr bool permuteB = false; + static constexpr bool DoubleSmemBuffer = false; + static constexpr bool TransposeC = false; + + static constexpr int kBlockPerCu = 1; + static constexpr ck_tile::index_t TileParitionerGroupNum = 8; + static constexpr ck_tile::index_t TileParitionerM01 = 4; + + using GemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence, + permuteA, + permuteB>; + + + using TilePartitioner = + ck_tile::GemmSpatiallyLocalTilePartitioner; + + using Traits = + ck_tile::TileGemmTraits; + + using GemmUniversalTraits = + ck_tile::TileGemmUniversalTraits; + + using GemmPipelineProblem = + ck_tile::GemmPipelineProblem; + + using BaseGemmPipeline = {PIPELINE_MAP[pipeline][0]}; + + const ck_tile::index_t k_grain = args.k_batch * TileK; + const ck_tile::index_t K_split = (args.K + k_grain - 1) / k_grain * TileK; + const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); + const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); + const ck_tile::TailNumber tail_num = 
BaseGemmPipeline::GetBlockLoopTailNum(num_loop); + + float ave_time{{0}}; + + const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {{ + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + constexpr auto scheduler = {SCHEDULER_MAP[scheduler]}; + + using UniversalGemmProblem = + ck_tile::UniversalGemmPipelineProblem; + + using GemmPipeline = {PIPELINE_MAP[pipeline][1]}; + {EPILOGUE_MAP[epilogue]} + using Kernel = ck_tile::GemmKernel; + auto kargs = Kernel::MakeKernelArgs(args); + + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); + constexpr dim3 blocks = Kernel::BlockSize(); + + if(!Kernel::IsSupportedArgument(kargs)) + {{ + throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!"); + }} + + if(s.log_level_ > 0) + {{ + std::cout << "Launching kernel with args:" + << " grid: {{" << grids.x << ", " << grids.y << ", " << grids.z << "}}" + << ", blocks: {{" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}}" + << std::endl; + }} + + ave_time = ck_tile::launch_kernel(s, + ck_tile::make_kernel( + Kernel{{}}, grids, blocks, 0, kargs)); + return ave_time; + + }}; + + if(has_hot_loop) {{ + {HOT_LOOP_TRUE[pipeline]} + }} else {{ + {HOT_LOOP_FALSE} + }} + + return ave_time; + }} +}}; +""" + + def generate_common_instances_header(self): + """Generate common instances header""" + content = """// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +#pragma once +""" + for group in self.all_kernels: + content += f"#include \"gemm_{group}.hpp\"\n" + (self.output_dir / "gemm_instances.hpp").write_text(content) + + def _generate_dispatcher(self): + """Generate dispatch mechanism""" + content = """// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "gemm_common.hpp" +#include "gemm_instances.hpp" +#include "gemm_host_api.hpp" +#include +#include +#include + +struct GemmDispatcher { + static auto& get_kernel_map() { + // Use a static local variable + static std::unordered_map> kernel_map; + return kernel_map; + } + + static void init() { + auto& kernel_map = get_kernel_map(); + if(!kernel_map.empty()) return; + \n""" + # Add tile/warp instantiations + tile_params = set(itertools.product( + self.config.impl_cfg["tile_m"]["values"], + self.config.impl_cfg["tile_n"]["values"], + self.config.impl_cfg["tile_k"]["values"], + self.config.impl_cfg["warp_m"]["values"], + self.config.impl_cfg["warp_n"]["values"], + self.config.impl_cfg["warp_k"]["values"], + self.config.impl_cfg["warp_tile_m"]["values"], + self.config.impl_cfg["warp_tile_n"]["values"], + self.config.impl_cfg["warp_tile_k"]["values"] + )) + + + for group in self.all_kernels: + content += f""" kernel_map["{group}"] = [](ck_tile::GemmHostArgs& args, + const ck_tile::stream_config& s) {{ + std::vector results;""" + for tile in tile_params: + # Check if we have valid tile/warp combinations + # (tile_m/(warp_m*warp_tile_m)) * warp_m * warp_tile_m == tile_m + if ((tile[0]/(tile[3] * tile[7]) * tile[3] * tile[7]) != tile[0]) or \ + ((tile[1]/(tile[4] * tile[8]) * tile[4] * tile[8]) != tile[1]): + continue + content += f""" + //we can have multiple tiles config for the one kernel_trait + return {group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}>::launch(args, s);""" + content += """ + };\n""" + + content += """ } + + + static float dispatch(const KernelTraits &trait, ck_tile::GemmHostArgs& gemm_args, + const ck_tile::stream_config& s) { + init(); + const std::string key = assemble_key(trait); + auto& kernel_map = get_kernel_map(); + if(auto it = kernel_map.find(key); it != kernel_map.end()) { + return it->second(gemm_args, s); //Running single instance + } + throw 
std::runtime_error("No suitable kernel found: " + key); + } + +private: + static std::string assemble_key(const KernelTraits &trait) { + return std::string(trait.pipeline) + "_" + + trait.epilogue + "_" + + trait.scheduler + "_" + + "pad_" + + (trait.kPadM ? "true" : "false") + "_" + + (trait.kPadN ? "true" : "false") + "_" + + (trait.kPadK ? "true" : "false"); + } +}; + +""" + (self.output_dir / "gemm_dispatcher.hpp").write_text(content) + + +def do_list_blobs(args, gemm_config): + generator = GemmCodeGenerator(args.working_path, gemm_config) + generator.list_all() + +def do_gen_blobs(args, gemm_config): + generator = GemmCodeGenerator(args.working_path, gemm_config) + generator.generate_all() + + + +def main(args): + # Read and validate json file + with open(args.json, 'r') as json_file: + config_data = json.load(json_file) + + # Validate and parse configuration + gemm_config = GemmConfig(config_data) + + if args.list_blobs: + do_list_blobs(args, gemm_config) + elif args.gen_blobs: + do_gen_blobs(args, gemm_config) + else: + # If neither was specified, either do nothing or default to gen_blobs + print("No mode specified (use --list_blobs or --gen_blobs). 
Generating by default...") + do_gen_blobs(args, gemm_config) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="generate", + description="gen API for CK gemm kernel", + ) + parser.add_argument( + "-w", "--working_path", default="./", required=False, help="the path where all the blobs are going to be generated" + ) + parser.add_argument( + "-j", "--json", required=True, help="Path to the json which contains the kernel configurations" + ) + parser.add_argument( + "-l", "--list_blobs", action = 'store_true', help="List all kernel to file" + ) + parser.add_argument( + "-g", "--gen_blobs", action = 'store_true', help="Generate all kernels into different files" + ) + + args = parser.parse_args() + + main(args) From 7142d8003c6a99f952a62bbd0b90d5f0261fc807 Mon Sep 17 00:00:00 2001 From: Muhammed Emin Ozturk Date: Thu, 3 Apr 2025 14:22:43 -0700 Subject: [PATCH 020/443] CkProfiler StreamK GemmUniversal Fix and Split Gemm_universal Test (#2044) * fix and split gemm_universal test * clang * Update test_gemm_universal_ut_cases_bf16.inc * Update test_gemm_universal_xdl_bf16.cpp * Update test_gemm_universal_ut_cases_fp16.inc --- .../profile_gemm_universal_streamk_impl.hpp | 2 +- test/gemm_universal/CMakeLists.txt | 15 ++- ... 
=> test_gemm_universal_ut_cases_bf16.inc} | 60 +++------- .../test_gemm_universal_ut_cases_fp16.inc | 99 +++++++++++++++ .../test_gemm_universal_ut_cases_fp8.inc | 113 ++++++++++++++++++ ...l.cpp => test_gemm_universal_xdl_bf16.cpp} | 34 ++---- .../test_gemm_universal_xdl_fp16.cpp | 82 +++++++++++++ .../test_gemm_universal_xdl_fp8.cpp | 71 +++++++++++ .../test_gemm_universal_streamk_util.hpp | 12 +- 9 files changed, 409 insertions(+), 79 deletions(-) mode change 100644 => 100755 profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp mode change 100644 => 100755 test/gemm_universal/CMakeLists.txt rename test/gemm_universal/{test_gemm_universal_ut_cases.inc => test_gemm_universal_ut_cases_bf16.inc} (75%) create mode 100644 test/gemm_universal/test_gemm_universal_ut_cases_fp16.inc create mode 100644 test/gemm_universal/test_gemm_universal_ut_cases_fp8.inc rename test/gemm_universal/{test_gemm_universal_xdl.cpp => test_gemm_universal_xdl_bf16.cpp} (61%) create mode 100644 test/gemm_universal/test_gemm_universal_xdl_fp16.cpp create mode 100644 test/gemm_universal/test_gemm_universal_xdl_fp8.cpp diff --git a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp old mode 100644 new mode 100755 index d145ab1766..e625fae808 --- a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp +++ b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp @@ -166,7 +166,7 @@ bool profile_gemm_universal_streamk_impl(int do_verification, 0, 1, 2, 3, 4}; // 0: Data Parallel (DP) mode (Stream-K OFF), 1: 1-tile Stream-K+ DP, // 2:2-tile Stream-K + DP - if(Grid_size != -1) + if(Grid_size == -1) { grid_size_list = {Grid_size}; } diff --git a/test/gemm_universal/CMakeLists.txt b/test/gemm_universal/CMakeLists.txt old mode 100644 new mode 100755 index 4aab6323cc..cf5c68e220 --- a/test/gemm_universal/CMakeLists.txt +++ b/test/gemm_universal/CMakeLists.txt @@ -1,4 +1,15 @@ 
-add_gtest_executable(test_gemm_universal test_gemm_universal_xdl.cpp) +add_gtest_executable(test_gemm_universal_fp16 test_gemm_universal_xdl_fp16.cpp) if(result EQUAL 0) - target_link_libraries(test_gemm_universal PRIVATE utility device_gemm_universal_instance) + target_link_libraries(test_gemm_universal_fp16 PRIVATE utility device_gemm_universal_instance) endif() + +add_gtest_executable(test_gemm_universal_fp8 test_gemm_universal_xdl_fp8.cpp) +if(result EQUAL 0) + target_link_libraries(test_gemm_universal_fp8 PRIVATE utility device_gemm_universal_instance) +endif() + +add_gtest_executable(test_gemm_universal_bf16 test_gemm_universal_xdl_bf16.cpp) +if(result EQUAL 0) + target_link_libraries(test_gemm_universal_bf16 PRIVATE utility device_gemm_universal_instance) +endif() + diff --git a/test/gemm_universal/test_gemm_universal_ut_cases.inc b/test/gemm_universal/test_gemm_universal_ut_cases_bf16.inc similarity index 75% rename from test/gemm_universal/test_gemm_universal_ut_cases.inc rename to test/gemm_universal/test_gemm_universal_ut_cases_bf16.inc index 9a21666856..8a6c672a9f 100644 --- a/test/gemm_universal/test_gemm_universal_ut_cases.inc +++ b/test/gemm_universal/test_gemm_universal_ut_cases_bf16.inc @@ -1,6 +1,6 @@ #pragma once -TYPED_TEST(TestGemmUniversal_MK_KN, SmallM) +TYPED_TEST(TestGemmUniversal_BF16_MK_KN, SmallM) { std::vector Ms{1, 2, 3, 4, 5, 6}; constexpr int N = 512; @@ -14,7 +14,7 @@ TYPED_TEST(TestGemmUniversal_MK_KN, SmallM) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_MK_NK, SmallM) +TYPED_TEST(TestGemmUniversal_BF16_MK_NK, SmallM) { std::vector Ms{1, 2, 3, 4, 5, 6}; constexpr int N = 512; @@ -28,7 +28,7 @@ TYPED_TEST(TestGemmUniversal_MK_NK, SmallM) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_KM_KN, SmallM) +TYPED_TEST(TestGemmUniversal_BF16_KM_KN, SmallM) { std::vector Ms{1, 2, 3, 4, 5, 6}; constexpr int N = 512; @@ -44,7 +44,7 @@ TYPED_TEST(TestGemmUniversal_KM_KN, SmallM) 
} } -TYPED_TEST(TestGemmUniversal_KM_NK, SmallM) +TYPED_TEST(TestGemmUniversal_BF16_KM_NK, SmallM) { std::vector Ms{1, 2, 3, 4, 5, 6}; constexpr int N = 512; @@ -60,7 +60,7 @@ TYPED_TEST(TestGemmUniversal_KM_NK, SmallM) } } -TYPED_TEST(TestGemmUniversal_MK_KN, MidLargeM) +TYPED_TEST(TestGemmUniversal_BF16_MK_KN, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; constexpr int N = 512; @@ -74,7 +74,7 @@ TYPED_TEST(TestGemmUniversal_MK_KN, MidLargeM) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_MK_NK, MidLargeM) +TYPED_TEST(TestGemmUniversal_BF16_MK_NK, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; constexpr int N = 512; @@ -88,7 +88,7 @@ TYPED_TEST(TestGemmUniversal_MK_NK, MidLargeM) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_KM_KN, MidLargeM) +TYPED_TEST(TestGemmUniversal_BF16_KM_KN, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; constexpr int N = 512; @@ -104,7 +104,7 @@ TYPED_TEST(TestGemmUniversal_KM_KN, MidLargeM) } } -TYPED_TEST(TestGemmUniversal_KM_NK, MidLargeM) +TYPED_TEST(TestGemmUniversal_BF16_KM_NK, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; constexpr int N = 512; @@ -120,7 +120,7 @@ TYPED_TEST(TestGemmUniversal_KM_NK, MidLargeM) } } -TYPED_TEST(TestGemmUniversal_MK_KN, PaddK) +TYPED_TEST(TestGemmUniversal_BF16_MK_KN, PaddK) { std::vector Ms{127}; constexpr int N = 512; @@ -134,7 +134,7 @@ TYPED_TEST(TestGemmUniversal_MK_KN, PaddK) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_MK_NK, PaddK) +TYPED_TEST(TestGemmUniversal_BF16_MK_NK, PaddK) { std::vector Ms{127}; constexpr int N = 512; @@ -148,7 +148,7 @@ TYPED_TEST(TestGemmUniversal_MK_NK, PaddK) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_KM_KN, PaddK) +TYPED_TEST(TestGemmUniversal_BF16_KM_KN, PaddK) { std::vector Ms{127}; constexpr int N = 512; @@ -164,7 +164,7 @@ TYPED_TEST(TestGemmUniversal_KM_KN, PaddK) } } 
-TYPED_TEST(TestGemmUniversal_KM_NK, PaddK) +TYPED_TEST(TestGemmUniversal_BF16_KM_NK, PaddK) { std::vector Ms{127}; constexpr int N = 512; @@ -180,7 +180,7 @@ TYPED_TEST(TestGemmUniversal_KM_NK, PaddK) } } -TYPED_TEST(TestGemmUniversal_MK_KN, Regular) +TYPED_TEST(TestGemmUniversal_BF16_MK_KN, Regular) { std::vector Ms{512}; constexpr int N = 512; @@ -194,7 +194,7 @@ TYPED_TEST(TestGemmUniversal_MK_KN, Regular) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_MK_NK, Regular) +TYPED_TEST(TestGemmUniversal_BF16_MK_NK, Regular) { std::vector Ms{512}; constexpr int N = 512; @@ -207,35 +207,3 @@ TYPED_TEST(TestGemmUniversal_MK_NK, Regular) for(int M : Ms) this->Run(M, N, K, StrideA, StrideB, StrideC); } - -TYPED_TEST(TestGemmUniversal_KM_KN, Regular) -{ - std::vector Ms{512}; - constexpr int N = 512; - constexpr int K = 512; - - constexpr int StrideB = N; - constexpr int StrideC = N; - - for(int M : Ms) - { - int StrideA = M; - this->Run(M, N, K, StrideA, StrideB, StrideC); - } -} - -TYPED_TEST(TestGemmUniversal_KM_NK, Regular) -{ - std::vector Ms{512}; - constexpr int N = 512; - constexpr int K = 512; - - constexpr int StrideB = N; - constexpr int StrideC = N; - - for(int M : Ms) - { - int StrideA = M; - this->Run(M, N, K, StrideA, StrideB, StrideC); - } -} diff --git a/test/gemm_universal/test_gemm_universal_ut_cases_fp16.inc b/test/gemm_universal/test_gemm_universal_ut_cases_fp16.inc new file mode 100644 index 0000000000..b61ea0e6b4 --- /dev/null +++ b/test/gemm_universal/test_gemm_universal_ut_cases_fp16.inc @@ -0,0 +1,99 @@ +#pragma once + +TYPED_TEST(TestGemmUniversal_FP16_MK_KN, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP16_MK_NK, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr 
int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP16_MK_NK, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP16_MK_KN, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 512; + constexpr int K = 437; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP16_MK_NK, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 512; + constexpr int K = 437; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP16_MK_KN, Regular) +{ + std::vector Ms{512}; + constexpr int N = 512; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP16_MK_NK, Regular) +{ + std::vector Ms{512}; + constexpr int N = 512; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} diff --git a/test/gemm_universal/test_gemm_universal_ut_cases_fp8.inc b/test/gemm_universal/test_gemm_universal_ut_cases_fp8.inc new file mode 100644 index 0000000000..b831e15e9c --- /dev/null +++ b/test/gemm_universal/test_gemm_universal_ut_cases_fp8.inc @@ -0,0 +1,113 @@ +#pragma once + +TYPED_TEST(TestGemmUniversal_FP8_MK_KN, 
SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP8_MK_NK, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP8_MK_KN, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP8_MK_NK, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP8_MK_KN, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 512; + constexpr int K = 437; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP8_MK_NK, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 512; + constexpr int K = 437; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP8_MK_KN, Regular) +{ + std::vector Ms{512}; + constexpr int N = 512; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + 
this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP8_MK_NK, Regular) +{ + std::vector Ms{512}; + constexpr int N = 512; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} diff --git a/test/gemm_universal/test_gemm_universal_xdl.cpp b/test/gemm_universal/test_gemm_universal_xdl_bf16.cpp similarity index 61% rename from test/gemm_universal/test_gemm_universal_xdl.cpp rename to test/gemm_universal/test_gemm_universal_xdl_bf16.cpp index b872d7089a..8fde65657a 100644 --- a/test/gemm_universal/test_gemm_universal_xdl.cpp +++ b/test/gemm_universal/test_gemm_universal_xdl_bf16.cpp @@ -7,8 +7,6 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "test_gemm_universal_util.hpp" -using F8 = ck::f8_t; -using F16 = ck::half_t; using BF16 = ck::bhalf_t; using F32 = float; @@ -29,25 +27,25 @@ struct tuple_concat, std::tuple> } // namespace template -class TestGemmUniversal_MK_KN +class TestGemmUniversal_BF16_MK_KN : public ck::test::TestGemmUniversal, Tuple>::type> { }; template -class TestGemmUniversal_MK_NK +class TestGemmUniversal_BF16_MK_NK : public ck::test::TestGemmUniversal, Tuple>::type> { }; template -class TestGemmUniversal_KM_KN +class TestGemmUniversal_BF16_KM_KN : public ck::test::TestGemmUniversal, Tuple>::type> { }; template -class TestGemmUniversal_KM_NK +class TestGemmUniversal_BF16_KM_NK : public ck::test::TestGemmUniversal, Tuple>::type> { }; @@ -55,22 +53,12 @@ class TestGemmUniversal_KM_NK // clang-format off using KernelTypes_MK_KN = ::testing::Types< // ADataType, BDataType, ComputeDataType, CDataType - std::tuple< F16, F16, F16, F16>, -#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) - std::tuple< F16, F8, F16, F16>, - std::tuple< F8, F16, F16, F16>, - std::tuple< F8, F8, F8, BF16>, -#endif + std::tuple< BF16, BF16, BF16, 
BF16> >; using KernelTypes_MK_NK = ::testing::Types< // ADataType, BDataType, ComputeDataType, CDataType - std::tuple< F16, F16, F16, F16>, -#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) - std::tuple< F16, F8, F16, F16>, - std::tuple< F8, F16, F16, F16>, - std::tuple< F8, F8, F8, BF16>, -#endif + std::tuple< BF16, BF16, BF16, BF16> >; @@ -86,9 +74,9 @@ using KernelTypes_KM_KN = ::testing::Types< // clang-format on -TYPED_TEST_SUITE(TestGemmUniversal_MK_KN, KernelTypes_MK_KN); -TYPED_TEST_SUITE(TestGemmUniversal_MK_NK, KernelTypes_MK_NK); -TYPED_TEST_SUITE(TestGemmUniversal_KM_KN, KernelTypes_KM_KN); -TYPED_TEST_SUITE(TestGemmUniversal_KM_NK, KernelTypes_KM_NK); +TYPED_TEST_SUITE(TestGemmUniversal_BF16_MK_KN, KernelTypes_MK_KN); +TYPED_TEST_SUITE(TestGemmUniversal_BF16_MK_NK, KernelTypes_MK_NK); +TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_KN, KernelTypes_KM_KN); +TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_NK, KernelTypes_KM_NK); -#include "test_gemm_universal_ut_cases.inc" +#include "test_gemm_universal_ut_cases_bf16.inc" diff --git a/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp b/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp new file mode 100644 index 0000000000..24f587daf6 --- /dev/null +++ b/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "gtest/gtest.h" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "test_gemm_universal_util.hpp" + +using F8 = ck::f8_t; +using F16 = ck::half_t; + +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +namespace { + +template +struct tuple_concat; + +template +struct tuple_concat, std::tuple> +{ + using type = std::tuple; +}; + +} // namespace + +template +class TestGemmUniversal_FP16_MK_KN + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_FP16_MK_NK + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_FP16_KM_KN + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_FP16_KM_NK + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +// clang-format off +using KernelTypes_MK_KN = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + +#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) + std::tuple< F16, F8, F16, F16>, + std::tuple< F8, F16, F16, F16>, + +#endif + std::tuple< F16, F16, F16, F16> + >; +using KernelTypes_MK_NK = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + +#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) + std::tuple< F16, F8, F16, F16>, + std::tuple< F8, F16, F16, F16>, + +#endif + std::tuple< F16, F16, F16, F16> + >; + +// clang-format on + +TYPED_TEST_SUITE(TestGemmUniversal_FP16_MK_KN, KernelTypes_MK_KN); +TYPED_TEST_SUITE(TestGemmUniversal_FP16_MK_NK, KernelTypes_MK_NK); + +#include "test_gemm_universal_ut_cases_fp16.inc" diff --git a/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp b/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp new file mode 100644 index 0000000000..e833ab7825 --- /dev/null +++ 
b/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "gtest/gtest.h" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "test_gemm_universal_util.hpp" + +using F8 = ck::f8_t; +using F16 = ck::half_t; +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +namespace { + +template +struct tuple_concat; + +template +struct tuple_concat, std::tuple> +{ + using type = std::tuple; +}; + +} // namespace + +template +class TestGemmUniversal_FP8_MK_KN + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_FP8_MK_NK + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +// clang-format off +using KernelTypes_MK_KN = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType +#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) + std::tuple< F16, F8, F16, F16>, + std::tuple< F8, F16, F16, F16>, + std::tuple< F8, F8, F8, BF16>, +#endif + // Fallback test type when FP8 is not enabled + std::tuple< F16, F16, F16, F16> + >; +using KernelTypes_MK_NK = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + +#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) + std::tuple< F16, F8, F16, F16>, + std::tuple< F8, F16, F16, F16>, + std::tuple< F8, F8, F8, BF16>, +#endif + // Fallback test type when FP8 is not enabled + std::tuple< F16, F16, F16, F16> + >; + + +TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_KN, KernelTypes_MK_KN); +TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_NK, KernelTypes_MK_NK); + + +#include "test_gemm_universal_ut_cases_fp8.inc" diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp 
b/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp index ef3509c0ca..805587a274 100644 --- a/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp +++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp @@ -44,9 +44,8 @@ class TestGemmUniversal_Streamk : public testing::Test void SetUp() override { - grid_size_list = {38, 114, 228}; // {38, 76, 114, 152, 190, 228, 266, 304, 342, 380}; - streamk_sel_list = {0, 1, 2}; // 0: Data Parallel (DP) mode (Stream-K OFF), 1: 1-tile - // Stream-K+ DP, // {0, 1, 2, 3, 4} + streamk_sel_list = {0, 1, 2}; // 0: Data Parallel (DP) mode (Stream-K OFF), 1: 1-tile + // Stream-K+ DP, // {0, 1, 2, 3, 4} // 2:2-tile Stream-K + DP } @@ -58,10 +57,9 @@ class TestGemmUniversal_Streamk : public testing::Test const int StrideC) { for(auto streamk_sel : streamk_sel_list) - for(auto grid_size : grid_size_list) - { - RunSingle(M, N, K, StrideA, StrideB, StrideC, streamk_sel, grid_size); - } + { + RunSingle(M, N, K, StrideA, StrideB, StrideC, streamk_sel, -1); + } } void RunSingle(const int M, From 572cd820ce720aed32168660f7d3d41304390776 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 3 Apr 2025 15:30:21 -0700 Subject: [PATCH 021/443] Split env.hpp header from the ck.hpp header. 
(#2049) * split env.hpp out of main headers * fix namespace logic --- include/ck/ck.hpp | 5 ----- include/ck/host_utility/flush_cache.hpp | 1 + include/ck/host_utility/kernel_launch.hpp | 1 + ...batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp | 1 + .../device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp | 3 ++- ...evice_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp | 3 ++- .../impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp | 3 ++- ...fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp | 3 ++- ...v2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp | 3 ++- .../impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp | 3 ++- .../device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp | 3 ++- .../device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp | 3 ++- .../device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp | 1 + .../device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp | 3 ++- .../ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp | 3 ++- .../gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp | 3 ++- .../gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp | 3 ++- ...ice_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp | 1 + ...evice_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp | 2 +- .../impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp | 1 + .../device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp | 1 + ...device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp | 1 + .../gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp | 1 + ...rouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp | 3 ++- .../gpu/device/impl/device_grouped_gemm_xdl.hpp | 1 + .../device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp | 1 + .../gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp | 1 + .../gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 3 ++- .../gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp | 1 + .../gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp | 3 ++- .../gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp | 
3 ++- .../gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp | 1 + .../gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp | 3 ++- include/ck/utility/env.hpp | 5 +++++ include/ck_tile/core.hpp | 1 - include/ck_tile/core/config.hpp | 6 ------ include/ck_tile/core/utility/env.hpp | 4 ++++ include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp | 1 + .../include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp | 1 + profiler/include/profiler/profile_grouped_gemm_impl.hpp | 3 ++- .../profile_grouped_gemm_multiply_tile_loop_impl.hpp | 1 + .../profiler/profile_grouped_gemm_tile_loop_impl.hpp | 1 + 42 files changed, 64 insertions(+), 31 deletions(-) diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 1d49b68a32..9d5d5fbc0b 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -6,15 +6,10 @@ #include "ck/config.h" #if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC) -#include "ck/utility/env.hpp" #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS #include "hip/hip_runtime.h" #include "hip/hip_fp16.h" #endif - -// environment variable to enable logging: -// export CK_LOGGING=ON or CK_LOGGING=1 or CK_LOGGING=ENABLED -CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING) #endif // to do: add various levels of logging with CK_LOG_LEVEL diff --git a/include/ck/host_utility/flush_cache.hpp b/include/ck/host_utility/flush_cache.hpp index 918fb28ea9..08b3aba2b3 100644 --- a/include/ck/host_utility/flush_cache.hpp +++ b/include/ck/host_utility/flush_cache.hpp @@ -8,6 +8,7 @@ #include #include "ck/ck.hpp" +#include "ck/utility/env.hpp" #include "ck/stream_config.hpp" #include "ck/host_utility/hip_check_error.hpp" #include "ck/utility/flush_icache.hpp" diff --git a/include/ck/host_utility/kernel_launch.hpp b/include/ck/host_utility/kernel_launch.hpp index 5c1c1c4e60..11a1c9bbc0 100644 --- a/include/ck/host_utility/kernel_launch.hpp +++ b/include/ck/host_utility/kernel_launch.hpp @@ -6,6 +6,7 @@ #include #include "ck/ck.hpp" +#include "ck/utility/env.hpp" #include "ck/stream_config.hpp" #include 
"ck/host_utility/hip_check_error.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp index f6c228fb7b..d38698af4b 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp @@ -7,6 +7,7 @@ #include #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp index 30ae72a63e..de7d67f08b 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -7,6 +7,7 @@ #include #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp index 2662e5c360..bae5c6019d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -7,6 +7,7 @@ #include #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp index 0b73317c5e..d4f89b3e09 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -7,6 +7,7 @@ #include #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp index 13eb23574f..a8eb73d730 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -7,6 +7,7 @@ #include #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp index 28778d825b..6eb9281d30 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. 
All rights reserved. #pragma once @@ -8,6 +8,7 @@ #include #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp index 7fa231d4f4..5fad21f521 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -7,6 +7,7 @@ #include #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp index 3be7313d2b..c7aa54f1d9 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -7,6 +7,7 @@ #include #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp index 8aa20f7ad4..68ec8187a4 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #ifndef DEVICE_CONV3D_FWD_XDL_HPP #define DEVICE_CONV3D_FWD_XDL_HPP @@ -10,6 +10,7 @@ #include "device.hpp" #include "device_conv_fwd.hpp" #include "common_header.hpp" +#include "ck/utility/env.hpp" #include "tensor_layout.hpp" #include "convolution_forward_specialization.hpp" #include "tensor_descriptor.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp index 1edae33be3..ddabd61c3d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp @@ -7,6 +7,7 @@ #include #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp 
b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp index de8f35a640..2881036bee 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -7,6 +7,7 @@ #include #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp index eb0fb55f5d..7faee161c1 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -7,6 +7,7 @@ #include #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp index fd6f3b65f2..213501468a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -7,6 +7,7 @@ #include #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp index c2a27ebbdb..7315fe75a3 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -7,6 +7,7 @@ #include #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp index 770e531e44..08edddf107 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp @@ -8,6 +8,7 @@ #include "ck/library/utility/numeric.hpp" #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp index 4d730b1f37..da7c4f759b 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp @@ -8,7 +8,7 @@ #include #include "ck/utility/common_header.hpp" - +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp 
b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp index f40b238c8a..c904b4e7d5 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp @@ -8,6 +8,7 @@ #include #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp index 272b832e11..c0148c3b9c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp @@ -11,6 +11,7 @@ #include "ck/library/utility/numeric.hpp" #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp index b2f1dbfa5c..a93e6ded96 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp @@ -11,6 +11,7 @@ #include "ck/library/utility/numeric.hpp" #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include 
"ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp index 463b10de43..10d8a4a44d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp @@ -8,6 +8,7 @@ #include #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp index d692aa05ce..18872e38ea 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -8,6 +8,7 @@ #include #include "ck/ck.hpp" +#include "ck/utility/env.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" #include "ck/host_utility/hip_check_error.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp index d9a0249da8..aa70a24fc1 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp @@ -8,6 +8,7 @@ #include #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp index a2afb62eec..01f52881f4 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp @@ -7,6 +7,7 @@ #include #include "ck/ck.hpp" +#include "ck/utility/env.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" #include "ck/host_utility/hip_check_error.hpp" diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp index cc8ae1806a..e5e32a8535 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp @@ -4,6 +4,7 @@ #pragma once #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include 
"ck/tensor_description/multi_index_transform_helper.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index 9f6d85dd78..29150c0688 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -1,9 +1,10 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/multi_index_transform_helper.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp index ffa01efe17..a22fc06a50 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp @@ -4,6 +4,7 @@ #pragma once #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/multi_index_transform_helper.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp index 27818b6964..7124687d5d 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp +++ 
b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -13,6 +13,7 @@ #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" namespace ck { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp index b805f600d5..ac3e821340 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp @@ -1,9 +1,10 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/multi_index_transform_helper.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp index 715fcbcfef..c204b95d0f 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp @@ -13,6 +13,7 @@ #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" namespace ck { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp index 6ee279a3f1..256b495c6e 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp @@ -1,9 +1,10 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/multi_index_transform_helper.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" diff --git a/include/ck/utility/env.hpp b/include/ck/utility/env.hpp index 809f302f74..469fb70f10 100644 --- a/include/ck/utility/env.hpp +++ b/include/ck/utility/env.hpp @@ -184,4 +184,9 @@ void UpdateEnvVar(EnvVar, const std::string_view& val) } } // namespace ck + +// environment variable to enable logging: +// export CK_LOGGING=ON or CK_LOGGING=1 or CK_LOGGING=ENABLED +CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING) + #endif diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index 821b3a8e84..d9aa8b3551 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -59,7 +59,6 @@ #include "ck_tile/core/tensor/transpose_tile.hpp" #include "ck_tile/core/tensor/update_tile.hpp" #include "ck_tile/core/utility/bit_cast.hpp" -#include "ck_tile/core/utility/env.hpp" #include "ck_tile/core/utility/functional.hpp" #include "ck_tile/core/utility/functional_with_tuple.hpp" #include "ck_tile/core/utility/ignore.hpp" diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp index b1d201e30e..978f673346 100644 --- a/include/ck_tile/core/config.hpp +++ b/include/ck_tile/core/config.hpp @@ -28,12 +28,6 @@ #include "hip/hip_fp16.h" #endif -#include "ck_tile/core/utility/env.hpp" - -// environment variable to enable logging: -// export CK_TILE_LOGGING=ON or CK_TILE_LOGGING=1 or CK_TILE_LOGGING=ENABLED -CK_TILE_DECLARE_ENV_VAR_BOOL(CK_TILE_LOGGING) - #ifdef __HIPCC__ #define CK_TILE_HOST inline __host__ #define CK_TILE_DEVICE inline __device__ diff --git a/include/ck_tile/core/utility/env.hpp b/include/ck_tile/core/utility/env.hpp index 5b0b7a9071..9b148b3e0b 100644 --- a/include/ck_tile/core/utility/env.hpp +++ b/include/ck_tile/core/utility/env.hpp @@ -202,3 +202,7 @@ void 
UpdateEnvVar(EnvVar, const std::string_view& val) } } // namespace ck_tile + +// environment variable to enable logging: +// export CK_TILE_LOGGING=ON or CK_TILE_LOGGING=1 or CK_TILE_LOGGING=ENABLED +CK_TILE_DECLARE_ENV_VAR_BOOL(CK_TILE_LOGGING) diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp index e5b9d17bac..bc41f680f2 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp @@ -9,6 +9,7 @@ #include "ck_tile/core.hpp" #include "ck_tile/ops/common.hpp" #include "ck_tile/host/concat.hpp" +#include "ck_tile/core/utility/env.hpp" namespace ck_tile { diff --git a/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp index 09e03de99c..8fb20f0135 100644 --- a/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp @@ -6,6 +6,7 @@ #include #include "ck/ck.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" diff --git a/profiler/include/profiler/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp index 367e94de11..fc2ba5a650 100644 --- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp @@ -1,11 +1,12 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once #include #include "ck/ck.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp" diff --git a/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp index 94ee2a37e4..1b17f05760 100644 --- a/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp @@ -6,6 +6,7 @@ #include #include "ck/ck.hpp" +#include "ck/utility/env.hpp" #include "ck/host_utility/hip_check_error.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp" diff --git a/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp index 3a4ca24dda..cf3c3a6bae 100644 --- a/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp @@ -6,6 +6,7 @@ #include #include "ck/ck.hpp" +#include "ck/utility/env.hpp" #include "ck/host_utility/hip_check_error.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp" From b443056a26cd25e6e621ff1c026b02eefdfe1f29 Mon Sep 17 00:00:00 2001 From: Khushbu Agarwal Date: Thu, 3 Apr 2025 16:24:34 -0700 Subject: [PATCH 022/443] Documentation for newly added struct (#2051) --- tile_engine/ops/gemm/gemm_host_api.hpp | 17 ++++++++++++++++- tile_engine/ops/gemm/gemm_instance_builder.py | 3 +-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp index 4f0ea52a18..3fa6dca863 100644 --- 
a/tile_engine/ops/gemm/gemm_host_api.hpp +++ b/tile_engine/ops/gemm/gemm_host_api.hpp @@ -54,6 +54,17 @@ struct DataTypeTraits static constexpr const char* name = "pk_int4_t"; }; +/** + * @brief trait for GEMM kernel + * @param pipeline: pipeline name + * @param scheduler: scheduler name + * @param epilogue: epilogue name + * @param kPadM: padding for M dimension + * @param kPadN: padding for N dimension + * @param kPadK: padding for K dimension + * + */ + struct KernelTraits { std::string pipeline; @@ -173,7 +184,11 @@ void permute_vectors_i4x4_b(Tensor& tensor) } } -// verification code +/** + * @brief Function to verify the kernel output with reference implementation on CPU/GPU + * + */ + template Date: Thu, 3 Apr 2025 16:55:49 -0700 Subject: [PATCH 023/443] file clang formatted (#2053) --- tile_engine/ops/gemm/gemm_host_api.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp index 3fa6dca863..375f808966 100644 --- a/tile_engine/ops/gemm/gemm_host_api.hpp +++ b/tile_engine/ops/gemm/gemm_host_api.hpp @@ -56,13 +56,13 @@ struct DataTypeTraits /** * @brief trait for GEMM kernel - * @param pipeline: pipeline name - * @param scheduler: scheduler name - * @param epilogue: epilogue name - * @param kPadM: padding for M dimension - * @param kPadN: padding for N dimension - * @param kPadK: padding for K dimension - * + * @param pipeline: pipeline name + * @param scheduler: scheduler name + * @param epilogue: epilogue name + * @param kPadM: padding for M dimension + * @param kPadN: padding for N dimension + * @param kPadK: padding for K dimension + * */ struct KernelTraits @@ -186,7 +186,7 @@ void permute_vectors_i4x4_b(Tensor& tensor) /** * @brief Function to verify the kernel output with reference implementation on CPU/GPU - * + * */ template Date: Mon, 7 Apr 2025 14:18:01 +0800 Subject: [PATCH 024/443] Add new receipt (#2055) --- 
example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py | 7 +++++++ example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py index 6326a97f8e..94f89256f9 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py @@ -545,6 +545,13 @@ def get_bwd_dq_dk_dv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> cond &= dpad == dvpad if not cond: continue + elif receipt == 600: + cond = dtype in ['fp16', 'bf16'] + cond &= mode in ["batch", "group"] + cond &= dropout in ['no', 'dropout_wg32', 'dropout_wg16'] + cond &= dpad == dvpad + if not cond: + continue api_pool.register_dq_dk_dv_traits(k.api_trait()) gen.append(k) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index e5d11c6dc9..d978cc1d9b 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -536,6 +536,14 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm cond &= pipeline.F_squant == 'f' if not cond: continue + # Aiter aiter::mha_fwd integration + elif receipt == 500: + cond = dtype in ['fp16', 'bf16'] + cond &= mode in ['batch', 'group'] + cond &= pipeline.F_vlayout == 'row' + cond &= pipeline.F_squant == 'f' + if not cond: + continue api_pool.register_traits(k.api_trait()) gen.append(k) From 29f72662165bcdfa746b1a247d9c8487cbb68f2e Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Mon, 7 Apr 2025 06:49:36 -0700 Subject: [PATCH 025/443] =?UTF-8?q?Revert=20"CkProfiler=20StreamK=20GemmUn?= =?UTF-8?q?iversal=20Fix=20and=20Split=20Gemm=5Funiversal=20Test=20=20(?= =?UTF-8?q?=E2=80=A6"=20(#2054)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 
7142d8003c6a99f952a62bbd0b90d5f0261fc807. --- .../profile_gemm_universal_streamk_impl.hpp | 2 +- test/gemm_universal/CMakeLists.txt | 15 +-- ...6.inc => test_gemm_universal_ut_cases.inc} | 60 +++++++--- .../test_gemm_universal_ut_cases_fp16.inc | 99 --------------- .../test_gemm_universal_ut_cases_fp8.inc | 113 ------------------ ...l_bf16.cpp => test_gemm_universal_xdl.cpp} | 34 ++++-- .../test_gemm_universal_xdl_fp16.cpp | 82 ------------- .../test_gemm_universal_xdl_fp8.cpp | 71 ----------- .../test_gemm_universal_streamk_util.hpp | 12 +- 9 files changed, 79 insertions(+), 409 deletions(-) mode change 100755 => 100644 profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp mode change 100755 => 100644 test/gemm_universal/CMakeLists.txt rename test/gemm_universal/{test_gemm_universal_ut_cases_bf16.inc => test_gemm_universal_ut_cases.inc} (75%) delete mode 100644 test/gemm_universal/test_gemm_universal_ut_cases_fp16.inc delete mode 100644 test/gemm_universal/test_gemm_universal_ut_cases_fp8.inc rename test/gemm_universal/{test_gemm_universal_xdl_bf16.cpp => test_gemm_universal_xdl.cpp} (61%) delete mode 100644 test/gemm_universal/test_gemm_universal_xdl_fp16.cpp delete mode 100644 test/gemm_universal/test_gemm_universal_xdl_fp8.cpp diff --git a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp old mode 100755 new mode 100644 index e625fae808..d145ab1766 --- a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp +++ b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp @@ -166,7 +166,7 @@ bool profile_gemm_universal_streamk_impl(int do_verification, 0, 1, 2, 3, 4}; // 0: Data Parallel (DP) mode (Stream-K OFF), 1: 1-tile Stream-K+ DP, // 2:2-tile Stream-K + DP - if(Grid_size == -1) + if(Grid_size != -1) { grid_size_list = {Grid_size}; } diff --git a/test/gemm_universal/CMakeLists.txt b/test/gemm_universal/CMakeLists.txt old mode 100755 new mode 
100644 index cf5c68e220..4aab6323cc --- a/test/gemm_universal/CMakeLists.txt +++ b/test/gemm_universal/CMakeLists.txt @@ -1,15 +1,4 @@ -add_gtest_executable(test_gemm_universal_fp16 test_gemm_universal_xdl_fp16.cpp) +add_gtest_executable(test_gemm_universal test_gemm_universal_xdl.cpp) if(result EQUAL 0) - target_link_libraries(test_gemm_universal_fp16 PRIVATE utility device_gemm_universal_instance) + target_link_libraries(test_gemm_universal PRIVATE utility device_gemm_universal_instance) endif() - -add_gtest_executable(test_gemm_universal_fp8 test_gemm_universal_xdl_fp8.cpp) -if(result EQUAL 0) - target_link_libraries(test_gemm_universal_fp8 PRIVATE utility device_gemm_universal_instance) -endif() - -add_gtest_executable(test_gemm_universal_bf16 test_gemm_universal_xdl_bf16.cpp) -if(result EQUAL 0) - target_link_libraries(test_gemm_universal_bf16 PRIVATE utility device_gemm_universal_instance) -endif() - diff --git a/test/gemm_universal/test_gemm_universal_ut_cases_bf16.inc b/test/gemm_universal/test_gemm_universal_ut_cases.inc similarity index 75% rename from test/gemm_universal/test_gemm_universal_ut_cases_bf16.inc rename to test/gemm_universal/test_gemm_universal_ut_cases.inc index 8a6c672a9f..9a21666856 100644 --- a/test/gemm_universal/test_gemm_universal_ut_cases_bf16.inc +++ b/test/gemm_universal/test_gemm_universal_ut_cases.inc @@ -1,6 +1,6 @@ #pragma once -TYPED_TEST(TestGemmUniversal_BF16_MK_KN, SmallM) +TYPED_TEST(TestGemmUniversal_MK_KN, SmallM) { std::vector Ms{1, 2, 3, 4, 5, 6}; constexpr int N = 512; @@ -14,7 +14,7 @@ TYPED_TEST(TestGemmUniversal_BF16_MK_KN, SmallM) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_BF16_MK_NK, SmallM) +TYPED_TEST(TestGemmUniversal_MK_NK, SmallM) { std::vector Ms{1, 2, 3, 4, 5, 6}; constexpr int N = 512; @@ -28,7 +28,7 @@ TYPED_TEST(TestGemmUniversal_BF16_MK_NK, SmallM) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_BF16_KM_KN, SmallM) 
+TYPED_TEST(TestGemmUniversal_KM_KN, SmallM) { std::vector Ms{1, 2, 3, 4, 5, 6}; constexpr int N = 512; @@ -44,7 +44,7 @@ TYPED_TEST(TestGemmUniversal_BF16_KM_KN, SmallM) } } -TYPED_TEST(TestGemmUniversal_BF16_KM_NK, SmallM) +TYPED_TEST(TestGemmUniversal_KM_NK, SmallM) { std::vector Ms{1, 2, 3, 4, 5, 6}; constexpr int N = 512; @@ -60,7 +60,7 @@ TYPED_TEST(TestGemmUniversal_BF16_KM_NK, SmallM) } } -TYPED_TEST(TestGemmUniversal_BF16_MK_KN, MidLargeM) +TYPED_TEST(TestGemmUniversal_MK_KN, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; constexpr int N = 512; @@ -74,7 +74,7 @@ TYPED_TEST(TestGemmUniversal_BF16_MK_KN, MidLargeM) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_BF16_MK_NK, MidLargeM) +TYPED_TEST(TestGemmUniversal_MK_NK, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; constexpr int N = 512; @@ -88,7 +88,7 @@ TYPED_TEST(TestGemmUniversal_BF16_MK_NK, MidLargeM) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_BF16_KM_KN, MidLargeM) +TYPED_TEST(TestGemmUniversal_KM_KN, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; constexpr int N = 512; @@ -104,7 +104,7 @@ TYPED_TEST(TestGemmUniversal_BF16_KM_KN, MidLargeM) } } -TYPED_TEST(TestGemmUniversal_BF16_KM_NK, MidLargeM) +TYPED_TEST(TestGemmUniversal_KM_NK, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; constexpr int N = 512; @@ -120,7 +120,7 @@ TYPED_TEST(TestGemmUniversal_BF16_KM_NK, MidLargeM) } } -TYPED_TEST(TestGemmUniversal_BF16_MK_KN, PaddK) +TYPED_TEST(TestGemmUniversal_MK_KN, PaddK) { std::vector Ms{127}; constexpr int N = 512; @@ -134,7 +134,7 @@ TYPED_TEST(TestGemmUniversal_BF16_MK_KN, PaddK) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_BF16_MK_NK, PaddK) +TYPED_TEST(TestGemmUniversal_MK_NK, PaddK) { std::vector Ms{127}; constexpr int N = 512; @@ -148,7 +148,7 @@ TYPED_TEST(TestGemmUniversal_BF16_MK_NK, PaddK) this->Run(M, N, K, StrideA, StrideB, StrideC); } 
-TYPED_TEST(TestGemmUniversal_BF16_KM_KN, PaddK) +TYPED_TEST(TestGemmUniversal_KM_KN, PaddK) { std::vector Ms{127}; constexpr int N = 512; @@ -164,7 +164,7 @@ TYPED_TEST(TestGemmUniversal_BF16_KM_KN, PaddK) } } -TYPED_TEST(TestGemmUniversal_BF16_KM_NK, PaddK) +TYPED_TEST(TestGemmUniversal_KM_NK, PaddK) { std::vector Ms{127}; constexpr int N = 512; @@ -180,7 +180,7 @@ TYPED_TEST(TestGemmUniversal_BF16_KM_NK, PaddK) } } -TYPED_TEST(TestGemmUniversal_BF16_MK_KN, Regular) +TYPED_TEST(TestGemmUniversal_MK_KN, Regular) { std::vector Ms{512}; constexpr int N = 512; @@ -194,7 +194,7 @@ TYPED_TEST(TestGemmUniversal_BF16_MK_KN, Regular) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_BF16_MK_NK, Regular) +TYPED_TEST(TestGemmUniversal_MK_NK, Regular) { std::vector Ms{512}; constexpr int N = 512; @@ -207,3 +207,35 @@ TYPED_TEST(TestGemmUniversal_BF16_MK_NK, Regular) for(int M : Ms) this->Run(M, N, K, StrideA, StrideB, StrideC); } + +TYPED_TEST(TestGemmUniversal_KM_KN, Regular) +{ + std::vector Ms{512}; + constexpr int N = 512; + constexpr int K = 512; + + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + { + int StrideA = M; + this->Run(M, N, K, StrideA, StrideB, StrideC); + } +} + +TYPED_TEST(TestGemmUniversal_KM_NK, Regular) +{ + std::vector Ms{512}; + constexpr int N = 512; + constexpr int K = 512; + + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + { + int StrideA = M; + this->Run(M, N, K, StrideA, StrideB, StrideC); + } +} diff --git a/test/gemm_universal/test_gemm_universal_ut_cases_fp16.inc b/test/gemm_universal/test_gemm_universal_ut_cases_fp16.inc deleted file mode 100644 index b61ea0e6b4..0000000000 --- a/test/gemm_universal/test_gemm_universal_ut_cases_fp16.inc +++ /dev/null @@ -1,99 +0,0 @@ -#pragma once - -TYPED_TEST(TestGemmUniversal_FP16_MK_KN, SmallM) -{ - std::vector Ms{1, 2, 3, 4, 5, 6}; - constexpr int N = 512; - constexpr int K = 320; - - constexpr int StrideA = K; - 
constexpr int StrideB = N; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - -TYPED_TEST(TestGemmUniversal_FP16_MK_NK, SmallM) -{ - std::vector Ms{1, 2, 3, 4, 5, 6}; - constexpr int N = 512; - constexpr int K = 320; - - constexpr int StrideA = K; - constexpr int StrideB = K; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - -TYPED_TEST(TestGemmUniversal_FP16_MK_NK, MidLargeM) -{ - std::vector Ms{127, 255, 312, 799, 1573}; - constexpr int N = 512; - constexpr int K = 320; - - constexpr int StrideA = K; - constexpr int StrideB = K; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - -TYPED_TEST(TestGemmUniversal_FP16_MK_KN, PaddK) -{ - std::vector Ms{127}; - constexpr int N = 512; - constexpr int K = 437; - - constexpr int StrideA = K; - constexpr int StrideB = N; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - -TYPED_TEST(TestGemmUniversal_FP16_MK_NK, PaddK) -{ - std::vector Ms{127}; - constexpr int N = 512; - constexpr int K = 437; - - constexpr int StrideA = K; - constexpr int StrideB = K; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - -TYPED_TEST(TestGemmUniversal_FP16_MK_KN, Regular) -{ - std::vector Ms{512}; - constexpr int N = 512; - constexpr int K = 512; - - constexpr int StrideA = K; - constexpr int StrideB = N; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - -TYPED_TEST(TestGemmUniversal_FP16_MK_NK, Regular) -{ - std::vector Ms{512}; - constexpr int N = 512; - constexpr int K = 512; - - constexpr int StrideA = K; - constexpr int StrideB = K; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} diff --git a/test/gemm_universal/test_gemm_universal_ut_cases_fp8.inc 
b/test/gemm_universal/test_gemm_universal_ut_cases_fp8.inc deleted file mode 100644 index b831e15e9c..0000000000 --- a/test/gemm_universal/test_gemm_universal_ut_cases_fp8.inc +++ /dev/null @@ -1,113 +0,0 @@ -#pragma once - -TYPED_TEST(TestGemmUniversal_FP8_MK_KN, SmallM) -{ - std::vector Ms{1, 2, 3, 4, 5, 6}; - constexpr int N = 512; - constexpr int K = 320; - - constexpr int StrideA = K; - constexpr int StrideB = N; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - -TYPED_TEST(TestGemmUniversal_FP8_MK_NK, SmallM) -{ - std::vector Ms{1, 2, 3, 4, 5, 6}; - constexpr int N = 512; - constexpr int K = 320; - - constexpr int StrideA = K; - constexpr int StrideB = K; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - -TYPED_TEST(TestGemmUniversal_FP8_MK_KN, MidLargeM) -{ - std::vector Ms{127, 255, 312, 799, 1573}; - constexpr int N = 512; - constexpr int K = 320; - - constexpr int StrideA = K; - constexpr int StrideB = N; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - -TYPED_TEST(TestGemmUniversal_FP8_MK_NK, MidLargeM) -{ - std::vector Ms{127, 255, 312, 799, 1573}; - constexpr int N = 512; - constexpr int K = 320; - - constexpr int StrideA = K; - constexpr int StrideB = K; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - -TYPED_TEST(TestGemmUniversal_FP8_MK_KN, PaddK) -{ - std::vector Ms{127}; - constexpr int N = 512; - constexpr int K = 437; - - constexpr int StrideA = K; - constexpr int StrideB = N; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - -TYPED_TEST(TestGemmUniversal_FP8_MK_NK, PaddK) -{ - std::vector Ms{127}; - constexpr int N = 512; - constexpr int K = 437; - - constexpr int StrideA = K; - constexpr int StrideB = K; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, 
StrideB, StrideC); -} - -TYPED_TEST(TestGemmUniversal_FP8_MK_KN, Regular) -{ - std::vector Ms{512}; - constexpr int N = 512; - constexpr int K = 512; - - constexpr int StrideA = K; - constexpr int StrideB = N; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - -TYPED_TEST(TestGemmUniversal_FP8_MK_NK, Regular) -{ - std::vector Ms{512}; - constexpr int N = 512; - constexpr int K = 512; - - constexpr int StrideA = K; - constexpr int StrideB = K; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} diff --git a/test/gemm_universal/test_gemm_universal_xdl_bf16.cpp b/test/gemm_universal/test_gemm_universal_xdl.cpp similarity index 61% rename from test/gemm_universal/test_gemm_universal_xdl_bf16.cpp rename to test/gemm_universal/test_gemm_universal_xdl.cpp index 8fde65657a..b872d7089a 100644 --- a/test/gemm_universal/test_gemm_universal_xdl_bf16.cpp +++ b/test/gemm_universal/test_gemm_universal_xdl.cpp @@ -7,6 +7,8 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "test_gemm_universal_util.hpp" +using F8 = ck::f8_t; +using F16 = ck::half_t; using BF16 = ck::bhalf_t; using F32 = float; @@ -27,25 +29,25 @@ struct tuple_concat, std::tuple> } // namespace template -class TestGemmUniversal_BF16_MK_KN +class TestGemmUniversal_MK_KN : public ck::test::TestGemmUniversal, Tuple>::type> { }; template -class TestGemmUniversal_BF16_MK_NK +class TestGemmUniversal_MK_NK : public ck::test::TestGemmUniversal, Tuple>::type> { }; template -class TestGemmUniversal_BF16_KM_KN +class TestGemmUniversal_KM_KN : public ck::test::TestGemmUniversal, Tuple>::type> { }; template -class TestGemmUniversal_BF16_KM_NK +class TestGemmUniversal_KM_NK : public ck::test::TestGemmUniversal, Tuple>::type> { }; @@ -53,12 +55,22 @@ class TestGemmUniversal_BF16_KM_NK // clang-format off using KernelTypes_MK_KN = ::testing::Types< // ADataType, BDataType, ComputeDataType, CDataType - + 
std::tuple< F16, F16, F16, F16>, +#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) + std::tuple< F16, F8, F16, F16>, + std::tuple< F8, F16, F16, F16>, + std::tuple< F8, F8, F8, BF16>, +#endif std::tuple< BF16, BF16, BF16, BF16> >; using KernelTypes_MK_NK = ::testing::Types< // ADataType, BDataType, ComputeDataType, CDataType - + std::tuple< F16, F16, F16, F16>, +#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) + std::tuple< F16, F8, F16, F16>, + std::tuple< F8, F16, F16, F16>, + std::tuple< F8, F8, F8, BF16>, +#endif std::tuple< BF16, BF16, BF16, BF16> >; @@ -74,9 +86,9 @@ using KernelTypes_KM_KN = ::testing::Types< // clang-format on -TYPED_TEST_SUITE(TestGemmUniversal_BF16_MK_KN, KernelTypes_MK_KN); -TYPED_TEST_SUITE(TestGemmUniversal_BF16_MK_NK, KernelTypes_MK_NK); -TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_KN, KernelTypes_KM_KN); -TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_NK, KernelTypes_KM_NK); +TYPED_TEST_SUITE(TestGemmUniversal_MK_KN, KernelTypes_MK_KN); +TYPED_TEST_SUITE(TestGemmUniversal_MK_NK, KernelTypes_MK_NK); +TYPED_TEST_SUITE(TestGemmUniversal_KM_KN, KernelTypes_KM_KN); +TYPED_TEST_SUITE(TestGemmUniversal_KM_NK, KernelTypes_KM_NK); -#include "test_gemm_universal_ut_cases_bf16.inc" +#include "test_gemm_universal_ut_cases.inc" diff --git a/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp b/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp deleted file mode 100644 index 24f587daf6..0000000000 --- a/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp +++ /dev/null @@ -1,82 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include - -#include "gtest/gtest.h" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "test_gemm_universal_util.hpp" - -using F8 = ck::f8_t; -using F16 = ck::half_t; - -using F32 = float; - -using Row = ck::tensor_layout::gemm::RowMajor; -using Col = ck::tensor_layout::gemm::ColumnMajor; - -namespace { - -template -struct tuple_concat; - -template -struct tuple_concat, std::tuple> -{ - using type = std::tuple; -}; - -} // namespace - -template -class TestGemmUniversal_FP16_MK_KN - : public ck::test::TestGemmUniversal, Tuple>::type> -{ -}; - -template -class TestGemmUniversal_FP16_MK_NK - : public ck::test::TestGemmUniversal, Tuple>::type> -{ -}; - -template -class TestGemmUniversal_FP16_KM_KN - : public ck::test::TestGemmUniversal, Tuple>::type> -{ -}; - -template -class TestGemmUniversal_FP16_KM_NK - : public ck::test::TestGemmUniversal, Tuple>::type> -{ -}; - -// clang-format off -using KernelTypes_MK_KN = ::testing::Types< - // ADataType, BDataType, ComputeDataType, CDataType - -#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) - std::tuple< F16, F8, F16, F16>, - std::tuple< F8, F16, F16, F16>, - -#endif - std::tuple< F16, F16, F16, F16> - >; -using KernelTypes_MK_NK = ::testing::Types< - // ADataType, BDataType, ComputeDataType, CDataType - -#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) - std::tuple< F16, F8, F16, F16>, - std::tuple< F8, F16, F16, F16>, - -#endif - std::tuple< F16, F16, F16, F16> - >; - -// clang-format on - -TYPED_TEST_SUITE(TestGemmUniversal_FP16_MK_KN, KernelTypes_MK_KN); -TYPED_TEST_SUITE(TestGemmUniversal_FP16_MK_NK, KernelTypes_MK_NK); - -#include "test_gemm_universal_ut_cases_fp16.inc" diff --git a/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp b/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp deleted file mode 100644 index e833ab7825..0000000000 --- a/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp +++ 
/dev/null @@ -1,71 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. - -#include - -#include "gtest/gtest.h" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "test_gemm_universal_util.hpp" - -using F8 = ck::f8_t; -using F16 = ck::half_t; -using BF16 = ck::bhalf_t; -using F32 = float; - -using Row = ck::tensor_layout::gemm::RowMajor; -using Col = ck::tensor_layout::gemm::ColumnMajor; - -namespace { - -template -struct tuple_concat; - -template -struct tuple_concat, std::tuple> -{ - using type = std::tuple; -}; - -} // namespace - -template -class TestGemmUniversal_FP8_MK_KN - : public ck::test::TestGemmUniversal, Tuple>::type> -{ -}; - -template -class TestGemmUniversal_FP8_MK_NK - : public ck::test::TestGemmUniversal, Tuple>::type> -{ -}; - -// clang-format off -using KernelTypes_MK_KN = ::testing::Types< - // ADataType, BDataType, ComputeDataType, CDataType -#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) - std::tuple< F16, F8, F16, F16>, - std::tuple< F8, F16, F16, F16>, - std::tuple< F8, F8, F8, BF16>, -#endif - // Fallback test type when FP8 is not enabled - std::tuple< F16, F16, F16, F16> - >; -using KernelTypes_MK_NK = ::testing::Types< - // ADataType, BDataType, ComputeDataType, CDataType - -#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) - std::tuple< F16, F8, F16, F16>, - std::tuple< F8, F16, F16, F16>, - std::tuple< F8, F8, F8, BF16>, -#endif - // Fallback test type when FP8 is not enabled - std::tuple< F16, F16, F16, F16> - >; - - -TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_KN, KernelTypes_MK_KN); -TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_NK, KernelTypes_MK_NK); - - -#include "test_gemm_universal_ut_cases_fp8.inc" diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp b/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp index 
805587a274..ef3509c0ca 100644 --- a/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp +++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp @@ -44,8 +44,9 @@ class TestGemmUniversal_Streamk : public testing::Test void SetUp() override { - streamk_sel_list = {0, 1, 2}; // 0: Data Parallel (DP) mode (Stream-K OFF), 1: 1-tile - // Stream-K+ DP, // {0, 1, 2, 3, 4} + grid_size_list = {38, 114, 228}; // {38, 76, 114, 152, 190, 228, 266, 304, 342, 380}; + streamk_sel_list = {0, 1, 2}; // 0: Data Parallel (DP) mode (Stream-K OFF), 1: 1-tile + // Stream-K+ DP, // {0, 1, 2, 3, 4} // 2:2-tile Stream-K + DP } @@ -57,9 +58,10 @@ class TestGemmUniversal_Streamk : public testing::Test const int StrideC) { for(auto streamk_sel : streamk_sel_list) - { - RunSingle(M, N, K, StrideA, StrideB, StrideC, streamk_sel, -1); - } + for(auto grid_size : grid_size_list) + { + RunSingle(M, N, K, StrideA, StrideB, StrideC, streamk_sel, grid_size); + } } void RunSingle(const int M, From 179322842274a635f6bd6141c7251a2f65b5fa34 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Mon, 7 Apr 2025 07:08:39 -0700 Subject: [PATCH 026/443] fix codegen issues (#2052) --- include/ck/utility/amd_ck_fp8.hpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/include/ck/utility/amd_ck_fp8.hpp b/include/ck/utility/amd_ck_fp8.hpp index b0089bb2d1..d079639c6a 100644 --- a/include/ck/utility/amd_ck_fp8.hpp +++ b/include/ck/utility/amd_ck_fp8.hpp @@ -557,7 +557,7 @@ template = false> static __device__ fp8_storage_t cast_to_f8_from_f16(_Float16 v, unsigned int rng = 0) { - std::ignore = rng; + ignore = rng; union { @@ -596,7 +596,7 @@ static __device__ fp8x2_storage_t cast_to_f8_from_f16(half2_t v, unsigned int rn cast_to_f8_from_f16(v[0], rng), cast_to_f8_from_f16(v[1], rng)}; #else - std::ignore = rng; + ignore = rng; union { @@ -634,7 +634,7 @@ template = false> static __device__ fp8_storage_t 
cast_to_f8_from_f16(_Float16 v, unsigned int rng = 0) { - std::ignore = rng; + ignore = rng; union { @@ -673,7 +673,7 @@ static __device__ fp8x2_storage_t cast_to_f8_from_f16(half2_t v, unsigned int rn cast_to_f8_from_f16(v[0], rng), cast_to_f8_from_f16(v[1], rng)}; #else - std::ignore = rng; + ignore = rng; union { @@ -805,7 +805,7 @@ template = false> static __device__ fp8_storage_t cast_to_f8_from_bf16(ushort v, unsigned int rng = 0) { - std::ignore = rng; + ignore = rng; union { @@ -847,7 +847,7 @@ static __device__ fp8x2_storage_t cast_to_f8_from_bf16(ushortx2_t v, unsigned in cast_to_f8_from_bf16(v[0], rng), cast_to_f8_from_bf16(v[1], rng)}; #else - std::ignore = rng; + ignore = rng; union { @@ -891,7 +891,7 @@ template = false> static __device__ fp8_storage_t cast_to_f8_from_bf16(ushort v, unsigned int rng = 0) { - std::ignore = rng; + ignore = rng; union { @@ -928,7 +928,7 @@ template = false> static __device__ fp8x2_storage_t cast_to_f8_from_bf16(ushortx2_t v, unsigned int rng = 0) { - std::ignore = rng; + ignore = rng; union { @@ -1544,7 +1544,7 @@ __host__ static inline fp8_storage_t cvt_half_t_to_fp8(const _Float16 x) sat == ck_saturation_t::CK_SATFINITE, stochastic_rounding>(x, rng); #else - std::ignore = rng; + ignore = rng; return cvt_float_to_fp8( static_cast(x)); #endif // defined(__gfx950__) @@ -1586,7 +1586,7 @@ __host__ static inline fp8x2_storage_t cvt_half_t_to_fp8(const half2_t x) sat == ck_saturation_t::CK_SATFINITE, stochastic_rounding>(x, rng); #else - std::ignore = rng; + ignore = rng; return cvt_float_to_fp8( float2_t{static_cast(x[0]), static_cast(x[1])}); #endif // defined(__gfx950__) @@ -1629,7 +1629,7 @@ __host__ static inline fp8_storage_t cvt_bhalf_t_to_fp8(const ushort x) sat == ck_saturation_t::CK_SATFINITE, stochastic_rounding>(x, rng); #else - std::ignore = rng; + ignore = rng; return cvt_float_to_fp8( bit_cast(uint32_t{x} << 16)); // convert value to float #endif // defined(__gfx950__) @@ -1678,7 +1678,7 @@ __host__ static 
inline fp8x2_storage_t cvt_bhalf_t_to_fp8(const ushortx2_t x) sat == ck_saturation_t::CK_SATFINITE, stochastic_rounding>(x, rng); #else - std::ignore = rng; + ignore = rng; return cvt_float_to_fp8( float2_t{bit_cast(uint32_t{x[0]} << 16), bit_cast(uint32_t{x[1]} << 16)}); // convert values to float From 72c0261ef1b40587ee8674b9d49b4fd6b46b0335 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Mon, 7 Apr 2025 12:48:34 -0700 Subject: [PATCH 027/443] Fix a couple of CI issues. (#2050) * fix jenkins jobs * fix perf log name for gfx908 * only run gemm perf tests on gfx908 --- Jenkinsfile | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 86cac3c485..dbd484d7bd 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -331,8 +331,10 @@ def cmake_build(Map conf=[:]){ } } else{ - // run unit tests - sh "make check" + // run unit tests unless building library for all targets + if (!params.BUILD_INSTANCES_ONLY){ + sh "make check" + } } } } @@ -604,12 +606,9 @@ def Build_CK(Map conf=[:]){ else if ( arch_type == 6 ){ // run standard tests on gfx908 echo "Run performance tests" - sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" - archiveArtifacts "perf_gemm_gfx908.log" + sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx908" archiveArtifacts "perf_onnx_gemm_gfx908.log" - archiveArtifacts "perf_resnet50_N256_gfx908.log" - archiveArtifacts "perf_resnet50_N4_gfx908.log" - stash includes: "perf_**.log", name: "perf_log_gfx908" + stash includes: "perf_onnx_gemm_gfx908.log", name: "perf_log_gfx908" } } } @@ -746,8 +745,7 @@ def process_results(Map conf=[:]){ //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version CRON_SETTINGS = BRANCH_NAME == "develop" ? 
'''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;ROCMVERSION=6.3;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true - 0 22 * * * % ROCMVERSION=6.3;BUILD_GFX908=true;BUILD_GFX12=false;RUN_PERFORMANCE_TESTS=false - 0 21 * * * % ROCMVERSION=6.3;hipTensor_test=true;RUN_CODEGEN_TESTS=true + 0 21 * * * % ROCMVERSION=6.3;hipTensor_test=true;RUN_CODEGEN_TESTS=true;BUILD_GFX908=true; 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false From 80aae6119b47d02ffebaa0d9b153fb075d0da140 Mon Sep 17 00:00:00 2001 From: aledudek Date: Tue, 8 Apr 2025 12:40:04 +0200 Subject: [PATCH 028/443] [CK_TILE] Fix GEMM Memory Pipeline (#2034) * [CK_TILE] Fix GEMM Memory Pipeline * Fix transpose tile * Add comments --- .../ck_tile/core/tensor/transpose_tile.hpp | 108 +++++++++++------- 1 file changed, 69 insertions(+), 39 deletions(-) diff --git a/include/ck_tile/core/tensor/transpose_tile.hpp b/include/ck_tile/core/tensor/transpose_tile.hpp index f34efe5c2f..5b65b79c1a 100644 --- a/include/ck_tile/core/tensor/transpose_tile.hpp +++ b/include/ck_tile/core/tensor/transpose_tile.hpp @@ -83,9 +83,6 @@ CK_TILE_DEVICE void transpose_tile2d_impl_in_thread(OutTensor& out_tensor, constexpr index_t num_vec_in = vec_length_out; constexpr index_t num_vec_out = vec_length_in; - using InVec = array; - using OutVec = array; - // SFC constexpr auto scalars_per_access_arr = generate_array( [&](auto i) { return (i == y_dim_vec_in or i == y_dim_vec_out) ? y_lengths[i] : 1; }, @@ -101,51 +98,84 @@ CK_TILE_DEVICE void transpose_tile2d_impl_in_thread(OutTensor& out_tensor, static_assert(num_access > 0, "wrong! 
num_access should be larger than 0"); - // in/out vectors to be transposed - thread_buffer in_vectors; - thread_buffer out_vectors; + if constexpr(num_vec_in == 1 || num_vec_out == 1) + { + // loop over SFC + static_for<0, num_access, 1>{}([&](auto iAccess) { + // data index [y0, y1, ...] in the order of input tensor + constexpr auto idx_y = SFC_Y::get_index(iAccess); - // loop over SFC and do transpose - static_for<0, num_access, 1>{}([&](auto iAccess) { - // data index [y0, y1, ...] in the order of input tensor - constexpr auto idx_y_start = SFC_Y::get_index(iAccess); + constexpr index_t in_offset = y_in_desc.calculate_offset(idx_y); + constexpr index_t out_offset = y_out_desc.calculate_offset(idx_y); - // get input vectors - static_for<0, num_vec_in, 1>{}([&](auto i) { - constexpr auto idx_y_in = generate_tuple( - [&](auto ii) { - return ii == y_dim_vec_out ? idx_y_start[ii] + i : idx_y_start[ii]; - }, - number{}); - - constexpr index_t in_offset = y_in_desc.calculate_offset(idx_y_in); - static_assert(in_offset % vec_length_in == 0); - - in_vectors(i).template get_as()(I0) = - in_tensor.get_thread_buffer() - .template get_as()[number{}]; + if constexpr(vec_length_in == 1) + { + out_tensor.get_thread_buffer()[number{}] = + in_tensor.get_thread_buffer()[number{}]; + } + else + { + using Vec = array; + out_tensor.get_thread_buffer().template get_as( + number{}) = + in_tensor.get_thread_buffer().template get_as( + number{}); + } }); + } + else + { + using InVec = array; + using OutVec = array; - // transpose - transpose_vectors{}(in_vectors, out_vectors); + // in/out vectors to be transposed + thread_buffer in_vectors; + thread_buffer out_vectors; - // set output vectors - static_for<0, num_vec_out, 1>{}([&](auto i) { - constexpr auto idx_y_out_tmp = generate_array( - [&](auto ii) { return ii == y_dim_vec_in ? 
idx_y_start[ii] + i : idx_y_start[ii]; }, - number{}); + // loop over SFC and do transpose + static_for<0, num_access, 1>{}([&](auto iAccess) { + // data index [y0, y1, ...] in the order of input tensor + constexpr auto idx_y_start = SFC_Y::get_index(iAccess); - constexpr auto idx_y_out = - container_reorder_given_new2old(idx_y_out_tmp, y_dim_out_to_in); + // get input vectors + static_for<0, num_vec_in, 1>{}([&](auto i) { + constexpr auto idx_y_in = generate_tuple( + [&](auto ii) { + return ii == y_dim_vec_out ? idx_y_start[ii] + i : idx_y_start[ii]; + }, + number{}); - constexpr index_t out_offset = y_out_desc.calculate_offset(idx_y_out); - static_assert(out_offset % vec_length_out == 0); + constexpr index_t in_offset = y_in_desc.calculate_offset(idx_y_in); + static_assert(in_offset % vec_length_in == 0); - out_tensor.get_thread_buffer().template set_as( - number{}, - out_vectors[i].template get_as()[I0]); + in_vectors(i).template get_as()(I0) = + in_tensor.get_thread_buffer() + .template get_as()[number{}]; + }); + + // transpose + transpose_vectors{}(in_vectors, out_vectors); + + // set output vectors + static_for<0, num_vec_out, 1>{}([&](auto i) { + constexpr auto idx_y_out_tmp = generate_array( + [&](auto ii) { + return ii == y_dim_vec_in ? 
idx_y_start[ii] + i : idx_y_start[ii]; + }, + number{}); + + constexpr auto idx_y_out = + container_reorder_given_new2old(idx_y_out_tmp, y_dim_out_to_in); + + constexpr index_t out_offset = y_out_desc.calculate_offset(idx_y_out); + static_assert(out_offset % vec_length_out == 0); + + out_tensor.get_thread_buffer().template set_as( + number{}, + out_vectors[i].template get_as()[I0]); + }); }); - }); + } } } // namespace detail From 6ce0797dadfc6d0c6cdde3e01532e90137fc5b0c Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Tue, 8 Apr 2025 09:00:51 -0700 Subject: [PATCH 029/443] simplify generate_tuple (#2043) --- include/ck/utility/sequence.hpp | 15 +++++++++++++++ include/ck/utility/tuple_helper.hpp | 9 +++++++-- include/ck_tile/core/container/tuple.hpp | 9 +++++++-- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/include/ck/utility/sequence.hpp b/include/ck/utility/sequence.hpp index 99935a6d8d..497625f7e2 100644 --- a/include/ck/utility/sequence.hpp +++ b/include/ck/utility/sequence.hpp @@ -184,6 +184,21 @@ struct Sequence } }; +namespace impl { +template +struct __integer_sequence; + +template +struct __integer_sequence +{ + using seq_type = Sequence; +}; +} // namespace impl + +template +using make_index_sequence = + typename __make_integer_seq::seq_type; + // merge sequence template struct sequence_merge diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp index b4f1545aa9..b1a0c1fc5d 100644 --- a/include/ck/utility/tuple_helper.hpp +++ b/include/ck/utility/tuple_helper.hpp @@ -11,11 +11,16 @@ namespace ck { +template +__host__ __device__ constexpr auto generate_tuple_for(F&& f, Sequence) +{ + return make_tuple(f(Number{})...); +} + template __host__ __device__ constexpr auto generate_tuple(F&& f, Number) { - return unpack([&f](auto&&... 
xs) { return make_tuple(f(xs)...); }, - typename arithmetic_sequence_gen<0, N, 1>::type{}); + return generate_tuple_for(f, make_index_sequence{}); } template diff --git a/include/ck_tile/core/container/tuple.hpp b/include/ck_tile/core/container/tuple.hpp index fd02177e25..3700d348e7 100644 --- a/include/ck_tile/core/container/tuple.hpp +++ b/include/ck_tile/core/container/tuple.hpp @@ -396,11 +396,16 @@ struct tuple_array_impl }; } // namespace impl +template +CK_TILE_HOST_DEVICE constexpr auto generate_tuple_for(F&& f, sequence) +{ + return make_tuple(f(number{})...); +} + template CK_TILE_HOST_DEVICE constexpr auto generate_tuple(F&& f, number) { - return unpack([&f](auto&&... is) { return make_tuple(f(is)...); }, - typename arithmetic_sequence_gen<0, N, 1>::type{}); + return generate_tuple_for(f, make_index_sequence{}); } template From b12cd6580b9737a9e8c6c055b25babc579242184 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Apr 2025 09:06:38 -0700 Subject: [PATCH 030/443] Bump rocm-docs-core from 1.18.1 to 1.18.2 in /docs/sphinx (#2047) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.18.1 to 1.18.2. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.18.1...v1.18.2) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-version: 1.18.2 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 2fcf3b3935..b89cb9fec8 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.18.1 +rocm-docs-core==1.18.2 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 12572d400e..2a52a48e4c 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -199,7 +199,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.18.1 +rocm-docs-core==1.18.2 # via -r requirements.in rpds-py==0.22.3 # via From 2c8132126ce089885d7aca40bc277196d8e78b34 Mon Sep 17 00:00:00 2001 From: spolifroni-amd Date: Tue, 8 Apr 2025 13:20:31 -0400 Subject: [PATCH 031/443] fixed broken github link (#2063) --- docs/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index 15a9321d43..6d46eb49b1 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,7 +10,7 @@ Composable Kernel User Guide The Composable Kernel library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages such as `HIP C++ `_. -The Composable Kernel repository is located at `https://github.com/ROCm/composable-kernel `_. +The Composable Kernel repository is located at `https://github.com/ROCm/composable_kernel `_. .. 
grid:: 2 :gutter: 3 From 263ff689e0cc03f9772f6a76eca57258db48698e Mon Sep 17 00:00:00 2001 From: Khushbu Agarwal Date: Tue, 8 Apr 2025 15:14:53 -0700 Subject: [PATCH 032/443] New instances for gemm_multiply_multiply_weightpreshuffle operator (#2061) * Add new instances for weight_preshuffle for f8->bf16 * Add new instances for weight_preshuffle for f8->f16 * clang formatted --------- Co-authored-by: Khushbu Agarwal Co-authored-by: Thomas Ning --- ...ultiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp | 12 +++++++++++- ...multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp | 8 +++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp index 4266ab9aa3..e5ada03a46 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp @@ -100,7 +100,17 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_ //##########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 16, 64, 512, 16, 16, 16, 16, 1, 1, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, 
PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 16, 128, 512, 16, 16, 16, 16, 1, 2, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 16, 1, 16>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 16, 256, 512, 16, 16, 16, 16, 1, 4, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 16, 1, 16>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8> + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 16, 256, 512, 16, 16, 16, 16, 1, 4, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 16, 1, 16>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>, + + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 32, 16, 512, 16, 16, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 4>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 16, 32, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, 
MultiplyMultiply, GemmSpec, 128, 16, 32, 512, 16, 16, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 16, 64, 512, 16, 16, 16, 16, 1, 1, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 32, 64, 512, 16, 16, 16, 16, 1, 2, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 64, 64, 512, 16, 16, 32, 32, 1, 1, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 64, 16, 512, 16, 16, 16, 16, 1, 1, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 64, 1, 4>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 64, 16, 
512, 16, 16, 16, 16, 1, 1, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 64, 1, 4>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8> + // clang-format on >; diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp index 94e44ee600..dc9db8889a 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp @@ -115,7 +115,13 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, F16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 16, 64, 256, 16, 16, 16, 16, 1, 1, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, F16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 16, 128, 256, 16, 16, 16, 16, 1, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 16, 1, 16>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, F16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 16, 256, 256, 16, 16, 16, 16, 1, 4, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 
S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 16, 1, 16>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, F16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 16, 512, 256, 16, 16, 16, 16, 1, 8, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 16, 1, 16>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8> + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, F16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 16, 512, 256, 16, 16, 16, 16, 1, 8, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 16, 1, 16>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>, + + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, F16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 16, 32, 512, 16, 16, 16, 16, 1, 1, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, F16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 16, 32, 128, 16, 16, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, F16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 128, 16, 32, 256, 16, 16, 16, 16, 1, 1, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 
2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, F16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 16, 64, 128, 8, 16, 16, 16, 1, 1, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8> + // clang-format on >; From 2c563fecf76eeecd49a28950ca601ff5ba5a735f Mon Sep 17 00:00:00 2001 From: valarLip <103567126+valarLip@users.noreply.github.com> Date: Wed, 9 Apr 2025 06:16:30 +0800 Subject: [PATCH 033/443] add passthrough for int32->float32 (#2062) --- .../gpu/element/unary_element_wise_operation.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index f602e36e73..672998d811 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -357,6 +357,12 @@ struct PassThrough y = type_convert(x); } + template <> + __host__ __device__ void operator()(float& y, const int32_t& x) const + { + y = type_convert(x); + } + template <> __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const { From 03ce8729fd52ce1e8e8c4290d5d1ea79ec12ffa4 Mon Sep 17 00:00:00 2001 From: MHYang-gh Date: Wed, 9 Apr 2025 06:34:11 +0800 Subject: [PATCH 034/443] Make buffer coherence configurable in tensor view (#2041) * Make buffer coherence configurable in tensor view * Fix clang-format for tensor_view.hpp --- include/ck_tile/core/tensor/tensor_view.hpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/include/ck_tile/core/tensor/tensor_view.hpp b/include/ck_tile/core/tensor/tensor_view.hpp index 
336793c5b1..32de227b52 100644 --- a/include/ck_tile/core/tensor/tensor_view.hpp +++ b/include/ck_tile/core/tensor/tensor_view.hpp @@ -411,18 +411,21 @@ struct null_tensor_view }; template CK_TILE_HOST_DEVICE constexpr auto make_tensor_view(DataType* p, const tensor_descriptor& desc) { - auto buffer_view = make_buffer_view(p, desc.get_element_space_size()); + auto buffer_view = + make_buffer_view(p, desc.get_element_space_size()); return tensor_view{buffer_view, desc}; } template {}, number{}); - auto buffer_view = make_buffer_view(p, desc.get_element_space_size()); + auto buffer_view = + make_buffer_view(p, desc.get_element_space_size()); return tensor_view{buffer_view, desc}; } template @@ -458,7 +463,8 @@ make_naive_tensor_view_packed(DataType* p, auto desc = make_naive_tensor_descriptor_packed(lengths, number{}); - auto buffer_view = make_buffer_view(p, desc.get_element_space_size()); + auto buffer_view = + make_buffer_view(p, desc.get_element_space_size()); return tensor_view{buffer_view, desc}; } From 3e6d21adeb33db1319899a3833113c9caf715358 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 9 Apr 2025 10:06:42 -0700 Subject: [PATCH 035/443] enable gfx115x support (#2065) --- example/CMakeLists.txt | 8 ++++---- include/ck/ck.hpp | 3 ++- include/ck/host_utility/device_prop.hpp | 4 +++- include/ck_tile/core/config.hpp | 3 ++- .../src/tensor_operation_instance/gpu/CMakeLists.txt | 12 ++++++------ test/CMakeLists.txt | 10 +++++----- 6 files changed, 22 insertions(+), 18 deletions(-) diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 64ff2a6813..996a543ecc 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -114,14 +114,14 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME) #only continue if there are some source files left on the list if(FILE_NAME) if(FILE_NAME MATCHES "_xdl" AND NOT FILE_NAME MATCHES "_pk_i4") - list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 
gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) + list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) elseif(FILE_NAME MATCHES "_wmma") list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950) elseif(FILE_NAME MATCHES "_mx") #only build mx example for gfx950 - list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) + list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) elseif(FILE_NAME MATCHES "_pk_i4") #only build these examples for gfx942 and gfx950 message("trimming targets for ${FILE_NAME}") - list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) + list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) endif() set_source_files_properties(${FILE_NAME} PROPERTIES LANGUAGE HIP) add_executable(${EXAMPLE_NAME} ${FILE_NAME}) @@ -212,7 +212,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME) #only continue if there are some source files left on the list if(FILE_NAME) if(FILE_NAME MATCHES "_xdl") - list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 
gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) + list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) elseif(FILE_NAME MATCHES "_wmma") list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950) endif() diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 9d5d5fbc0b..0c2dc799ab 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -65,7 +65,8 @@ #define __gfx103__ #endif #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \ - defined(__gfx1103__) || defined(__gfx11_generic__) + defined(__gfx1103__) || defined(__gfx1150__) || defined(__gfx1151__) || \ + defined(__gfx1152__) || defined(__gfx11_generic__) #define __gfx11__ #endif #if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__) diff --git a/include/ck/host_utility/device_prop.hpp b/include/ck/host_utility/device_prop.hpp index 3323ab6c7b..5439bbe1f0 100644 --- a/include/ck/host_utility/device_prop.hpp +++ b/include/ck/host_utility/device_prop.hpp @@ -86,7 +86,9 @@ inline bool is_gfx103_supported() inline bool is_gfx11_supported() { return ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || - ck::get_device_name() == "gfx1102" || ck::get_device_name() == "gfx1103"; + ck::get_device_name() == "gfx1102" || ck::get_device_name() == "gfx1103" || + ck::get_device_name() == "gfx1150" || ck::get_device_name() == "gfx1151" || + ck::get_device_name() == "gfx1152"; } inline bool is_gfx12_supported() diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp index 978f673346..414509e479 100644 --- a/include/ck_tile/core/config.hpp +++ b/include/ck_tile/core/config.hpp @@ -15,7 +15,8 @@ #define __gfx103__ #endif #if defined(__gfx1100__) || 
defined(__gfx1101__) || defined(__gfx1102__) || \ - defined(__gfx1103__) || defined(__gfx11_generic__) + defined(__gfx1103__) || defined(__gfx1150__) || defined(__gfx1151__) || \ + defined(__gfx1152__) || defined(__gfx11_generic__) #define __gfx11__ #endif #if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__) diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index a16418ec7e..2542dd236b 100755 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -95,26 +95,26 @@ function(add_instance_library INSTANCE_NAME) foreach(source IN LISTS ARGN) set(INST_TARGETS ${SUPPORTED_GPU_TARGETS}) if(source MATCHES "_xdl") - list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) elseif(source MATCHES "_wmma") list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950) elseif(source MATCHES "mha") - list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) endif() #only build the fp8 gemm instances for gfx90a if the build argument is set, otherwise only build for gfx942/gfx950 if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH) if(source MATCHES "gemm_xdl_universal" AND source MATCHES "f8") - 
list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) endif() if(source MATCHES "gemm_multiply_multiply" AND source MATCHES "f8") - list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) endif() else() if(source MATCHES "gemm_xdl_universal" AND source MATCHES "f8") - list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) endif() if(source MATCHES "gemm_multiply_multiply" AND source MATCHES "f8") - list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 
gfx10.3-generic gfx11-generic gfx12-generic) endif() endif() set(offload_targets) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 38fbf5385f..18611d8052 100755 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -101,11 +101,11 @@ function(add_test_executable TEST_NAME) #only continue if there are some source files left on the list if(ARGN) if(ARGN MATCHES "_xdl") - list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) + list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) elseif(ARGN MATCHES "_wmma") list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950) elseif(ARGN MATCHES "_smfmac") - list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) + list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx908 gfx90a gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) endif() set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP) add_executable(${TEST_NAME} ${ARGN}) @@ -197,13 +197,13 @@ function(add_gtest_executable TEST_NAME) #only continue if there are some source files left on the list if(ARGN) if(ARGN MATCHES "_xdl") - list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) + list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) elseif(ARGN MATCHES "_wmma") list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 
gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950) elseif(ARGN MATCHES "_smfmac") - list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) + list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx908 gfx90a gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) elseif(ARGN MATCHES "_mx") #only build mx example for gfx950 - list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) + list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) endif() set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP) add_executable(${TEST_NAME} ${ARGN}) From f14e648e7ca69c161c8910778e50c4c3a9d63f1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Thu, 10 Apr 2025 09:48:37 +0200 Subject: [PATCH 036/443] Replace inline assembly with builtins in FHMA (#2067) * Replace inline assembly with builtins in FHMA --------- Co-authored-by: illsilin --- .../core/arch/amd_buffer_addressing.hpp | 174 +++++++++++++++--- 1 file changed, 153 insertions(+), 21 deletions(-) diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 33faa3a18b..5d6d6ce348 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -14,6 +14,15 @@ #include "ck_tile/core/utility/bit_cast.hpp" #include 
"ck_tile/core/utility/functional.hpp" +// This attribute gives a hint to the compiler that a branch is likely to be taken. +// Then, the compiler should remove if possible the associated s_cbranch_execz branch that would +// have been generated. +#if __cplusplus >= 202002L +#define LIKELY(x) (x) [[likely]] +#else +#define LIKELY(x) (__builtin_expect(!!(x), 1)) +#endif + namespace ck_tile { // 128 bit SGPRs to supply buffer resource in buffer instructions @@ -58,10 +67,36 @@ template<> struct buffer_load_trait<4 , thread_buffer> { using payloa // TODO: glc/slc/... template struct buffer_load; + +template +struct buffer_load_if; + +template +struct buffer_store; + +template +struct buffer_store_if; + #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wundefined-reinterpret-cast" // TODO: strict aliasing rule seems fail when reinterpret_cast between vector type // (exp_vector_type(xxx)) + +#define HAS_RAW_BUFFER_BUILTINS \ + __has_builtin(__builtin_amdgcn_raw_buffer_load_b32) && \ + __has_builtin(__builtin_amdgcn_make_buffer_rsrc) && \ + __has_builtin(__builtin_amdgcn_raw_buffer_store_b32) + +#if HAS_RAW_BUFFER_BUILTINS +CK_TILE_DEVICE __amdgpu_buffer_rsrc_t cast_to_amdgpu_buffer_rsrc_t(int32x4_t res) +{ + __amdgpu_buffer_rsrc_t as_rsrc; + static_assert(sizeof(res) == sizeof(as_rsrc) && "Size of buffer resource should match"); + memcpy(&as_rsrc, &res, sizeof(res)); + return as_rsrc; +} +#endif + template struct buffer_load<16, pre_nop> { @@ -76,6 +111,11 @@ struct buffer_load<16, pre_nop> { static_assert(sizeof(T) == 16); using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t; +#if HAS_RAW_BUFFER_BUILTINS + index_t s_offset = i_offset; + reinterpret_cast(value) = __builtin_amdgcn_raw_buffer_load_b128( + cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0); +#else if constexpr(pre_nop) asm volatile("s_nop 4\n" "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3" @@ -87,6 +127,7 @@ struct buffer_load<16, pre_nop> : 
"+v"(reinterpret_cast(value)) : "v"(v_offset), "s"(res), "n"(i_offset) : "memory"); +#endif } }; @@ -104,6 +145,11 @@ struct buffer_load<8, pre_nop> { static_assert(sizeof(T) == 8); using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t; +#if HAS_RAW_BUFFER_BUILTINS + index_t s_offset = i_offset; + reinterpret_cast(value) = __builtin_amdgcn_raw_buffer_load_b64( + cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0); +#else if constexpr(pre_nop) asm volatile("s_nop 4\n" "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3" @@ -115,6 +161,7 @@ struct buffer_load<8, pre_nop> : "+v"(reinterpret_cast(value)) : "v"(v_offset), "s"(res), "n"(i_offset) : "memory"); +#endif } }; @@ -132,6 +179,12 @@ struct buffer_load<4, pre_nop> { static_assert(sizeof(T) == 4); using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t; + +#if HAS_RAW_BUFFER_BUILTINS + index_t s_offset = i_offset; + reinterpret_cast(value) = __builtin_amdgcn_raw_buffer_load_b32( + cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0); +#else if constexpr(pre_nop) asm volatile("s_nop 4\n" "buffer_load_dword %0, %1, %2, 0 offen offset:%3" @@ -143,6 +196,7 @@ struct buffer_load<4, pre_nop> : "+v"(reinterpret_cast(value)) : "v"(v_offset), "s"(res), "n"(i_offset) : "memory"); +#endif } }; @@ -160,6 +214,12 @@ struct buffer_load<2, pre_nop> { static_assert(sizeof(T) == 4); // subdword is buggy, use dword buf and convert manually using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t; + +#if HAS_RAW_BUFFER_BUILTINS + index_t s_offset = i_offset; + reinterpret_cast(value) = __builtin_amdgcn_raw_buffer_load_b16( + cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0); +#else if constexpr(pre_nop) asm volatile("s_nop 4\n" "buffer_load_ushort %0, %1, %2, 0 offen offset:%3" @@ -171,6 +231,7 @@ struct buffer_load<2, pre_nop> : "+v"(reinterpret_cast(value)) : "v"(v_offset), "s"(res), "n"(i_offset) : "memory"); +#endif } }; @@ -188,6 +249,11 @@ struct buffer_load<1, pre_nop> { 
static_assert(sizeof(T) == 4); using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t; +#if HAS_RAW_BUFFER_BUILTINS + index_t s_offset = i_offset; + reinterpret_cast(value) = __builtin_amdgcn_raw_buffer_load_b16( + cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0); +#else if constexpr(pre_nop) asm volatile("s_nop 4\n" "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3" @@ -199,12 +265,31 @@ struct buffer_load<1, pre_nop> : "+v"(reinterpret_cast(value)) : "v"(v_offset), "s"(res), "n"(i_offset) : "memory"); +#endif } }; -template -struct buffer_load_if; - +#if HAS_RAW_BUFFER_BUILTINS +template +struct buffer_load_if +{ + template + CK_TILE_DEVICE void operator()(T& value, + int32x4_t res /*buffer resource*/, + index_t v_offset, + index_t s_offset, + index_t i_offset /*max 0xFFF*/, + index_t flag = 0, + bool_constant = {}) + { + if LIKELY(1 <= flag) + { + buffer_load{}( + value, res, v_offset, s_offset, i_offset, flag, bool_constant{}); + } + } +}; +#else template struct buffer_load_if<16, pre_nop> { @@ -214,12 +299,12 @@ struct buffer_load_if<16, pre_nop> index_t v_offset, index_t /*s_offset*/, index_t i_offset /*max 0xFFF*/, - index_t flag = 0, + index_t flag = 0, bool_constant = {}) { static_assert(sizeof(T) == 16); auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t; + using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t; static_assert(sizeof(mbuf_t) == sizeof(T)); if constexpr(pre_nop) asm volatile("s_nop 4\n" @@ -248,12 +333,12 @@ struct buffer_load_if<8, pre_nop> index_t v_offset, index_t /*s_offset*/, index_t i_offset /*max 0xFFF*/, - index_t flag = 0, + index_t flag = 0, bool_constant = {}) { static_assert(sizeof(T) == 8); auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t; + using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t; if constexpr(pre_nop) asm volatile("s_nop 4\n" "v_cmpx_le_u32 
exec, 1, %4\n" @@ -281,12 +366,12 @@ struct buffer_load_if<4, pre_nop> index_t v_offset, index_t /*s_offset*/, index_t i_offset /*max 0xFFF*/, - index_t flag = 0, + index_t flag = 0, bool_constant = {}) { static_assert(sizeof(T) == 4); auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t; + using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t; if constexpr(pre_nop) asm volatile("s_nop 4\n" "v_cmpx_le_u32 exec, 1, %4\n" @@ -314,12 +399,12 @@ struct buffer_load_if<2, pre_nop> index_t v_offset, index_t /*s_offset*/, index_t i_offset /*max 0xFFF*/, - index_t flag = 0, + index_t flag = 0, bool_constant = {}) { static_assert(sizeof(T) == 4); auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t; + using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t; if constexpr(pre_nop) asm volatile("s_nop 4\n" "v_cmpx_le_u32 exec, 1, %4\n" @@ -347,12 +432,12 @@ struct buffer_load_if<1, pre_nop> index_t v_offset, index_t /*s_offset*/, index_t i_offset /*max 0xFFF*/, - index_t flag = 0, + index_t flag = 0, bool_constant = {}) { static_assert(sizeof(T) == 4); auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t; + using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t; if constexpr(pre_nop) asm volatile("s_nop 4\n" "v_cmpx_le_u32 exec, 1, %4\n" @@ -370,9 +455,9 @@ struct buffer_load_if<1, pre_nop> : "memory"); } }; +#endif + #pragma clang diagnostic pop // "-Wundefined-reinterpret-cast" -template -struct buffer_store; template <> struct buffer_store<16> @@ -387,10 +472,16 @@ struct buffer_store<16> { static_assert(sizeof(T) == 16); using mbuf_t = fp32x4_t; +#if HAS_RAW_BUFFER_BUILTINS + index_t s_offset = i_offset; + __builtin_amdgcn_raw_buffer_store_b128( + bit_cast(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0); +#else asm 
volatile("buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3" : : "v"(bit_cast(value)), "v"(v_offset), "s"(res), "n"(i_offset) : "memory"); +#endif } }; @@ -407,10 +498,16 @@ struct buffer_store<8> { static_assert(sizeof(T) == 8); using mbuf_t = fp32x2_t; +#if HAS_RAW_BUFFER_BUILTINS + index_t s_offset = i_offset; + __builtin_amdgcn_raw_buffer_store_b64( + bit_cast(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0); +#else asm volatile("buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3" : : "v"(bit_cast(value)), "v"(v_offset), "s"(res), "n"(i_offset) : "memory"); +#endif } }; @@ -427,10 +524,16 @@ struct buffer_store<4> { static_assert(sizeof(T) == 4); using mbuf_t = float; +#if HAS_RAW_BUFFER_BUILTINS + index_t s_offset = i_offset; + __builtin_amdgcn_raw_buffer_store_b32( + bit_cast(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0); +#else asm volatile("buffer_store_dword %0, %1, %2, 0 offen offset:%3" : : "v"(bit_cast(value)), "v"(v_offset), "s"(res), "n"(i_offset) : "memory"); +#endif } }; @@ -447,10 +550,16 @@ struct buffer_store<2> { static_assert(sizeof(T) == 2); using mbuf_t = short; +#if HAS_RAW_BUFFER_BUILTINS + index_t s_offset = i_offset; + __builtin_amdgcn_raw_buffer_store_b16( + bit_cast(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0); +#else asm volatile("buffer_store_short %0, %1, %2, 0 offen offset:%3" : : "v"(bit_cast(value)), "v"(v_offset), "s"(res), "n"(i_offset) : "memory"); +#endif } }; @@ -467,16 +576,38 @@ struct buffer_store<1> { static_assert(sizeof(T) == 4); using mbuf_t = float; +#if HAS_RAW_BUFFER_BUILTINS + index_t s_offset = i_offset; + __builtin_amdgcn_raw_buffer_store_b8( + bit_cast(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0); +#else asm volatile("buffer_store_byte %0, %1, %2, 0 offen offset:%3" : : "v"(bit_cast(value)), "v"(v_offset), "s"(res), "n"(i_offset) : "memory"); +#endif } }; +#if HAS_RAW_BUFFER_BUILTINS template -struct buffer_store_if; - +struct 
buffer_store_if +{ + template + CK_TILE_DEVICE void operator()(const T& value, + int32x4_t res /*buffer resource*/, + index_t v_offset, + index_t s_offset, + index_t i_offset /*max 0xFFF*/, + index_t flag = 1) + { + if LIKELY(1 <= flag) + { + buffer_store{}(value, res, v_offset, s_offset, i_offset); + } + } +}; +#else template <> struct buffer_store_if<16> { @@ -490,7 +621,7 @@ struct buffer_store_if<16> { static_assert(sizeof(T) == 16); auto save_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = fp32x4_t; + using mbuf_t = fp32x4_t; asm volatile("v_cmpx_le_u32 exec, 1, %4\n" "buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3\n" "s_mov_b64 exec %5" @@ -547,7 +678,7 @@ struct buffer_store_if<4> { static_assert(sizeof(T) == 4); auto save_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = float; + using mbuf_t = float; asm volatile("v_cmpx_le_u32 exec, 1, %4\n" "buffer_store_dword %0, %1, %2, 0 offen offset:%3\n" "s_mov_b64 exec %5" @@ -575,7 +706,7 @@ struct buffer_store_if<2> { static_assert(sizeof(T) == 2); auto save_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = short; + using mbuf_t = short; asm volatile("v_cmpx_le_u32 exec, 1, %4\n" "buffer_store_short %0, %1, %2, 0 offen offset:%3\n" "s_mov_b64 exec %5" @@ -603,7 +734,7 @@ struct buffer_store_if<1> { static_assert(sizeof(T) == 4); auto save_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = float; + using mbuf_t = float; asm volatile("v_cmpx_le_u32 exec, 1, %4\n" "buffer_store_byte %0, %1, %2, 0 offen offset:%3\n" "s_mov_b64 exec %5" @@ -617,6 +748,7 @@ struct buffer_store_if<1> : "memory"); } }; +#endif CK_TILE_DEVICE void buffer_load_fence(index_t cnt = 0) { From 5f885d2b7af1e6b2f40eefa9126f58e93e164e6d Mon Sep 17 00:00:00 2001 From: slippedJim Date: Thu, 10 Apr 2025 23:21:13 +0800 Subject: [PATCH 037/443] add fmha fwd splitkv receipt for aiter c++ api (#2068) * add s_randval for c++ api * Fix bug of bias in splitkv --------- Co-authored-by: rocking --- 
example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py | 13 +++++++++++-- example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py | 5 ++--- .../ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py | 12 ++++++++++++ example/ck_tile/01_fmha/generate.py | 4 ++-- .../ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp | 6 +++--- 5 files changed, 30 insertions(+), 10 deletions(-) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py index 94f89256f9..1e6755c631 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py @@ -545,10 +545,9 @@ def get_bwd_dq_dk_dv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> cond &= dpad == dvpad if not cond: continue + # aiter::mha_bwd C++ api integration elif receipt == 600: cond = dtype in ['fp16', 'bf16'] - cond &= mode in ["batch", "group"] - cond &= dropout in ['no', 'dropout_wg32', 'dropout_wg16'] cond &= dpad == dvpad if not cond: continue @@ -689,6 +688,11 @@ def get_bwd_dot_do_o_blobs(kernel_filter : Optional[str], receipt) -> List[FmhaB cond &= mode == "group" if not cond: continue + # aiter::mha_bwd C++ api integration + elif receipt == 600: + cond = dtype in ['fp16', 'bf16'] + if not cond: + continue gen.append(k) return gen @@ -841,6 +845,11 @@ def get_bwd_convert_dq_blobs(kernel_filter : Optional[str], receipt) -> List[Fmh cond &= mode == "group" if not cond: continue + # aiter::mha_bwd C++ api integration + elif receipt == 600: + cond = dtype in ['fp16', 'bf16'] + if not cond: + continue gen.append(k) return gen diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index d978cc1d9b..10a6e5c1d7 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -536,10 +536,9 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm cond &= pipeline.F_squant == 'f' if not cond: continue - # Aiter 
aiter::mha_fwd integration - elif receipt == 500: + # aiter::mha_fwd C++ api integration + elif receipt == 600: cond = dtype in ['fp16', 'bf16'] - cond &= mode in ['batch', 'group'] cond &= pipeline.F_vlayout == 'row' cond &= pipeline.F_squant == 'f' if not cond: diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index c6d1a01792..0dccdf6bd6 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -738,6 +738,13 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> cond &= pipeline.F_squant == 'f' if not cond: continue + # aiter::mha_fwd_splikv C++ api integration + elif receipt == 600: + cond = dtype in ['fp16', 'bf16'] + cond &= pipeline.F_vlayout == 'row' + cond &= pipeline.F_squant == 'f' + if not cond: + continue api_pool.register_traits(k.api_trait()) gen.append(k) @@ -796,6 +803,11 @@ def get_fwd_splitkv_combine_blobs(kernel_filter : Optional[str], receipt) -> Lis cond &= mode == "group" if not cond: continue + # aiter::mha_fwd_splikv C++ api integration + elif receipt == 600: + cond = dtype in ['fp16', 'bf16'] + if not cond: + continue gen.append(k) return gen diff --git a/example/ck_tile/01_fmha/generate.py b/example/ck_tile/01_fmha/generate.py index 0d35db14d4..25931da141 100644 --- a/example/ck_tile/01_fmha/generate.py +++ b/example/ck_tile/01_fmha/generate.py @@ -109,8 +109,8 @@ if __name__ == "__main__": " 100-199: Only generate instance for Aiter(mha_fwd) integration\n" + \ " 200-299: Only generate instance for Aiter(mha_varlen_fwd) integration\n" + \ " 300-399: Only generate instance for Aiter(mha_bwd) integration\n" + \ - " 400-499: Only generate instance for Aiter(mha_varlen_bwd) integration" - + " 400-499: Only generate instance for Aiter(mha_varlen_bwd) integration\n" + \ + " 600-699: Only generate instance for aiter::mha_fwd && aiter::mha_fwd_splitkv && 
aiter::mha_bwd C++ api integration" ) args = parser.parse_args() diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp index 143abe8048..ea1762abc1 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp @@ -95,8 +95,8 @@ struct FmhaFwdSplitKVKernel "w" + _TS_(g1wt::at(ck_tile::number<0>{})) + "x" + _TS_(g1wt::at(ck_tile::number<1>{})) + "x" + _TS_(g1wt::at(ck_tile::number<2>{})) + "_" + (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" + "v" + (std::is_same_v ? "r" : "c") + (pn.empty() ? "_npad" : "_" + pn) + - (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr::name)) + - (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kStoreLSE ? "_lse" : "_nlse" ) + + (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr::name)) + + (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kStoreLSE ? "_lse" : "_nlse" ) + (kDoFp8StaticQuant ? "_squant" : "_nsquant") + (kIsPagedKV ? 
"_pagedkv" : "_npagedkv" ); #undef _SS_ #undef _TS_ @@ -563,7 +563,7 @@ struct FmhaFwdSplitKVKernel } if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) { - batch_offset_bias = query_start * kargs.stride_bias + key_start; + batch_offset_bias = query_start * kargs.stride_bias; } batch_offset_lse_acc = query_start; From 6c61f4d237a9841c5b5d8b4380eaf9c2af14947e Mon Sep 17 00:00:00 2001 From: jakpiase Date: Fri, 11 Apr 2025 12:18:26 +0200 Subject: [PATCH 038/443] [CK_TILE] Add 2:4 structured sparsity support for fp16 gemm (#1957) * add structured sparsity fp16 support for gemm * added reviewer suggestions * update changelog * update changelog * add reviewers suggestions * Minor fix * clang fix * fix doxygen --- CHANGELOG.md | 1 + example/ck_tile/03_gemm/gemm_utils.hpp | 3 +- example/ck_tile/03_gemm/run_gemm_example.inc | 24 ++-- example/ck_tile/03_gemm/universal_gemm.cpp | 3 +- include/ck_tile/host/fill.hpp | 43 +++++++ .../gemm/pipeline/gemm_pipeline_problem.hpp | 3 +- ...emm_universal_pipeline_ag_bg_cr_policy.hpp | 4 +- .../ops/gemm/pipeline/tile_gemm_traits.hpp | 8 +- include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 13 +- .../gemm/warp/warp_gemm_attribute_smfmac.hpp | 80 ++++++++++++ .../warp/warp_gemm_attribute_smfmac_impl.hpp | 114 ++++++++++++++++++ .../ops/gemm/warp/warp_gemm_dispatcher.hpp | 15 ++- .../ops/gemm/warp/warp_gemm_smfmac_impl.hpp | 110 +++++++++++++++++ 13 files changed, 401 insertions(+), 20 deletions(-) create mode 100644 include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp create mode 100644 include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp create mode 100644 include/ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 49ef2998eb..e3d7971c71 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj * Added support for GKCYX layout for grouped convolution backward weight 
(NGCHW/GKCYX/NGKHW). * Added support for GKCYX layout for grouped convolution backward data (NGCHW/GKCYX/NGKHW). * Added support for Stream-K version of mixed fp8/bf16 GEMM +* Added support for FP16 2:4 structured sparsity to universal GEMM. ### Optimized diff --git a/example/ck_tile/03_gemm/gemm_utils.hpp b/example/ck_tile/03_gemm/gemm_utils.hpp index 3254a407fd..973006196b 100644 --- a/example/ck_tile/03_gemm/gemm_utils.hpp +++ b/example/ck_tile/03_gemm/gemm_utils.hpp @@ -93,7 +93,8 @@ struct GemmConfig static constexpr bool PermuteA = false; static constexpr bool PermuteB = false; - static constexpr bool TransposeC = false; + static constexpr bool TransposeC = false; + static constexpr bool UseStructuredSparsity = false; static constexpr int kBlockPerCu = 1; static constexpr ck_tile::index_t TileParitionerGroupNum = 8; diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc index c3b4ec609c..b4ea5d22c0 100644 --- a/example/ck_tile/03_gemm/run_gemm_example.inc +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -55,7 +55,8 @@ void permute_tensor_b(Tensor& tensor) ALayout, BLayout, CLayout, - GemmConfig::TransposeC>; + GemmConfig::TransposeC, + GemmConfig::UseStructuredSparsity>; using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem(flop) / 1.E9 / ave_time; float gb_per_sec = num_byte / 1.E6 / ave_time; - std::cout << "Run Gemm kernel with M =" << M << " N =" << N << " K =" << K - << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C - << " A_Layout =" << ALayout::name << " B_Layout =" << BLayout::name - << " C_Layout =" << CLayout::name << " A Type = " << DataTypeTraits::name - << " B Type = " << DataTypeTraits::name - << " C Type = " << DataTypeTraits::name << " : " << ave_time << " ms, " - << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl; + std::cout << "Run Gemm kernel with M=" << M << " N=" << N << " K=" << K + << " StrideA=" << stride_A << " 
StrideB=" << stride_B << " StrideC=" << stride_C + << " A_Layout=" << ALayout::name << " B_Layout =" << BLayout::name + << " C_Layout=" << CLayout::name << " A_Type=" << DataTypeTraits::name + << " B_Type=" << DataTypeTraits::name + << " C_Type=" << DataTypeTraits::name + << " StructuredSparsity=" << (GemmConfig::UseStructuredSparsity ? "on" : "off") + << " : " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << std::endl; return ave_time; } @@ -259,6 +262,11 @@ int run_gemm_example_with_layouts(int argc, b_k_n.SetZero(); } + if(GemmConfig::UseStructuredSparsity) + { + ck_tile::AdjustToStructuredSparsity{}(a_m_k); + } + ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp index eef8d3b60e..2ba16ca89d 100644 --- a/example/ck_tile/03_gemm/universal_gemm.cpp +++ b/example/ck_tile/03_gemm/universal_gemm.cpp @@ -46,7 +46,8 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& ALayout, BLayout, CLayout, - GemmConfig::TransposeC>; + GemmConfig::TransposeC, + GemmConfig::UseStructuredSparsity>; using GemmPipelineProblem = ck_tile::GemmPipelineProblem; diff --git a/include/ck_tile/host/fill.hpp b/include/ck_tile/host/fill.hpp index 006026470b..d90c0cf6cf 100644 --- a/include/ck_tile/host/fill.hpp +++ b/include/ck_tile/host/fill.hpp @@ -364,6 +364,49 @@ struct FillConstant } }; +//---------------------------------------------------------------------------------------------- +/// @brief Transforms given input to fit 2:4 structured sparsity pattern so +/// every subgroup of 4 elements contain at most 2 non-zero elements +template +struct AdjustToStructuredSparsity +{ + size_t start{0}; + // masks represent all valid 2:4 structured sparsity 
permutations + // clang-format off + static constexpr int32_t masks[] = {0, 0, 1, 1, + 0, 1, 0, 1, + 0, 1, 1, 0, + 1, 0, 0, 1, + 1, 0, 1, 0, + 1, 1, 0, 0, + 0, 0, 0, 1, + 0, 0, 1, 0, + 0, 1, 0, 0, + 1, 0, 0, 0}; + // clang-format on + + template + void operator()(ForwardIter first, ForwardIter last) const + { + std::transform(first, last, first, [=, index = start](T val) mutable { + auto tmp = val * masks[index % (sizeof(masks) / sizeof(int32_t))]; + index += 1; + + return type_convert(tmp); + }); + } + + template + auto operator()(ForwardRange&& range) const + -> std::void_t()( + std::begin(std::forward(range)), + std::end(std::forward(range))))> + { + (*this)(std::begin(std::forward(range)), + std::end(std::forward(range))); + } +}; + template struct FillTrigValue { diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp index f833ccc849..cba3677332 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp @@ -194,7 +194,8 @@ struct UniversalGemmPipelineProblem static constexpr auto HasHotLoop = HasHotLoop_; static constexpr auto TailNum = TailNum_; - static constexpr bool TransposeC = Traits::TransposeC; + static constexpr bool TransposeC = Traits::TransposeC; + static constexpr bool UseStructuredSparsity = Traits::UseStructuredSparsity; }; } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp index c504a51ad0..b555cf75e0 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp @@ -580,7 +580,9 @@ struct UniversalGemmPipelineAgBgCrPolicy WarpTile::at(I0), WarpTile::at(I1), WarpTile::at(I2), - Problem::TransposeC>; + Problem::TransposeC, + false, + 
Problem::UseStructuredSparsity>; using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy + bool TransposeC_ = false, + bool UseStructuredSparsity_ = false> struct TileGemmUniversalTraits { static constexpr bool kPadM = kPadM_; @@ -49,7 +50,8 @@ struct TileGemmUniversalTraits using BLayout = BLayout_; using CLayout = CLayout_; - static constexpr bool TransposeC = TransposeC_; + static constexpr bool TransposeC = TransposeC_; + static constexpr bool UseStructuredSparsity = UseStructuredSparsity_; }; } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp index 1fd12973f6..33f3dde256 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -7,6 +7,9 @@ #include "ck_tile/ops/gemm/warp/warp_gemm_impl.hpp" #include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp" + namespace ck_tile { // fp16 @@ -64,6 +67,14 @@ using WarpGemmMfmaF16F16F32M64N4K16 = WarpGemmImpl, 4>>; +// fp16 2:4 structured sparsity + +using WarpGemmSmfmacF16F16F32M32N32K16 = WarpGemmSmfmacImpl>>; + +using WarpGemmSmfmacF16F16F32M16N16K32 = WarpGemmSmfmacImpl>>; + // bf16 using WarpGemmMfmaBf16Bf16F32M32N32K8 = WarpGemmImpl< diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp new file mode 100644 index 0000000000..adf548aaca --- /dev/null +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp @@ -0,0 +1,80 @@ +#include "ck_tile/core.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp" + +namespace ck_tile { + +/** + * 
@brief Class describing structured sparsity mfma instructions. + * + * @paragraph Overview "Overview" + * Currently only 2:4 structured sparsity is supported, which is based on requirement that in every + * groups of four continuous elements there are at most two non-zero, which results in processing + * only half of elements in smfmac instruction. Because of structured sparsity A vector in smfmac + * instruction will be smaller than B vector by the factor of CompressionRatio. The indexes of + * non-zero elements are stored in `index` which is an additional parameter to assembly instruction. + * Every pair of two bit indexes are containing information about which two elements in current + * group of 4 values are non-zero and should be used inside smfmac instruction. Structured sparsity + * format is supported only for A matrix for now. + */ +template +struct WarpGemmAttributeSmfmac +{ + using Impl = remove_cvref_t; + + using ADataType = typename Impl::ADataType; + using BDataType = typename Impl::BDataType; + using IdxDataType = typename Impl::IdxDataType; + using CDataType = typename Impl::CDataType; + + using AVecType = typename Impl::AVecType; + using BVecType = typename Impl::BVecType; + using CVecType = typename Impl::CVecType; + + static constexpr index_t kM = Impl::kM; + static constexpr index_t kN = Impl::kN; + static constexpr index_t kK = Impl::kK; + static constexpr index_t kKPerThread = Impl::kABKPerLane; + static constexpr index_t kCompressionRatio = Impl::CompressionRatio; + + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; } + + static_assert(Impl::kAMBlock == 1 && Impl::kBNBlock == 1, + "Multi-block WarpGemmAttributeSmfmacImpl is not supported"); + + using AWarpDstrEncoding = tile_distribution_encoding< + sequence<>, + tuple, sequence>, + tuple>, + tuple>, + sequence<2>, + sequence<1>>; + + using BWarpDstrEncoding = tile_distribution_encoding< + sequence<>, + tuple, sequence>, + tuple>, + tuple>, + sequence<2>, + 
sequence<1>>; + + using CWarpDstrEncoding = tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<1, 1>, + sequence<0, 2>>; + + // c_vec += a_vec * b_vec[idx] + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + const int32_t& idx, + bool_constant = {}) const + { + Impl{}(c_vec, a_vec, b_vec, idx, bool_constant{}); + } +}; +} // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp new file mode 100644 index 0000000000..97fd2a8742 --- /dev/null +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "warp_gemm_attribute_mfma_impl.hpp" + +namespace ck_tile { + +// fp16 2:4 structured sparsity + +template +struct WarpGemmAttributeSmfmacImplF16F16F32M32N32K16 +{ + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = fp16_t; + using BDataType = fp16_t; + using IdxDataType = int32_t; + using CDataType = float; + + using AVecType = ext_vector_t; + using BVecType = ext_vector_t; + using CVecType = ext_vector_t; + + static constexpr index_t kM = 32; + static constexpr index_t kN = 32; + static constexpr index_t kK = 16; + + static constexpr index_t kAMBlock = 1; + static constexpr index_t kBNBlock = 1; + + static constexpr index_t kAMLane = 32; + static constexpr index_t kBNLane = 32; + static constexpr index_t kABKLane = 2; + static constexpr index_t kABKPerLane = 8; + + static constexpr index_t kCMLane = 2; + static constexpr index_t kCNLane = 32; + static constexpr index_t kCM0PerLane = 4; + static constexpr index_t kCM1PerLane = 4; + + static constexpr index_t CompressionRatio = 2; + + // c_vec += a_vec * b_vec[idx] + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const 
AVecType& a_vec, + const BVecType& b_vec, + const int32_t& idx, + bool_constant = {}) const + { +#if defined(__gfx9__) + c_vec = __builtin_amdgcn_smfmac_f32_32x32x16_f16(a_vec, b_vec, c_vec, idx, 0, 0); +#else + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; + ck_tile::ignore = idx; +#endif + } +}; + +template +struct WarpGemmAttributeSmfmacImplF16F16F32M16N16K32 +{ + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = fp16_t; + using BDataType = fp16_t; + using IdxDataType = int32_t; + using CDataType = float; + + using AVecType = ext_vector_t; + using BVecType = ext_vector_t; + using CVecType = ext_vector_t; + + static constexpr index_t kM = 16; + static constexpr index_t kN = 16; + static constexpr index_t kK = 32; + + static constexpr index_t kAMBlock = 1; + static constexpr index_t kBNBlock = 1; + + static constexpr index_t kAMLane = 16; + static constexpr index_t kBNLane = 16; + static constexpr index_t kABKLane = 4; + static constexpr index_t kABKPerLane = 8; + + static constexpr index_t kCMLane = 4; + static constexpr index_t kCNLane = 16; + static constexpr index_t kCM0PerLane = 1; + static constexpr index_t kCM1PerLane = 4; + + static constexpr index_t CompressionRatio = 2; + + // c_vec += a_vec * b_vec[idx] + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + const int32_t& idx, + bool_constant = {}) const + { +#if defined(__gfx9__) + c_vec = __builtin_amdgcn_smfmac_f32_16x16x32_f16(a_vec, b_vec, c_vec, idx, 0, 0); +#else + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; + ck_tile::ignore = idx; +#endif + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp index 9c319b5e5f..6320b33598 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp @@ 
-1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -16,7 +16,8 @@ template + bool SwizzleA = false, + bool UseStructuredSparsity = false> struct WarpGemmMfmaDispatcher; // clang-format off @@ -35,6 +36,10 @@ template<> struct WarpGemmMfmaDispatcher struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K8SwizzleA; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; }; +// fp16 2:4 structural sparsity +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmSmfmacF16F16F32M32N32K16; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmSmfmacF16F16F32M16N16K32; }; + // bf16 template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; }; @@ -70,7 +75,8 @@ template + bool SwizzleA = false, + bool UseStructuredSparsity = false> using WarpGemmMfmaDispatcher = typename impl::WarpGemmMfmaDispatcher::Type; + SwizzleA, + UseStructuredSparsity>::Type; } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp new file mode 100644 index 0000000000..9e028ddab0 --- /dev/null +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +namespace ck_tile { + +template +struct WarpGemmSmfmacImpl +{ + using WarpGemmAttribute = remove_cvref_t; + + static constexpr index_t kM = WarpGemmAttribute::kM; + static constexpr index_t kN = WarpGemmAttribute::kN; + static constexpr index_t kK = WarpGemmAttribute::kK; + /// @brief The number of elements in K dimension processed by single thread in wavefront. + /// + /// @note Note that WarpGemm may run MFMA instruction multiple times (on different K). + /// In such situation this value reflects this fact. + static constexpr index_t kKPerThread = WarpGemmAttribute::kKPerThread; + + using ADataType = typename WarpGemmAttribute::ADataType; + using BDataType = typename WarpGemmAttribute::BDataType; + using CDataType = typename WarpGemmAttribute::CDataType; + + using AWarpDstrEncoding = typename WarpGemmAttribute::AWarpDstrEncoding; + using BWarpDstrEncoding = typename WarpGemmAttribute::BWarpDstrEncoding; + using CWarpDstrEncoding = typename WarpGemmAttribute::CWarpDstrEncoding; + + using AWarpDstr = remove_cvref_t; + using BWarpDstr = remove_cvref_t; + using CWarpDstr = remove_cvref_t; + + using AWarpTensor = static_distributed_tensor; + using BWarpTensor = static_distributed_tensor; + using CWarpTensor = static_distributed_tensor; + + CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() + { + return WarpGemmAttribute_::get_num_of_access(); + } + + //---------------------------------------------------------------------------------------------- + /// @brief Compress A vector for 2:4 structured sparsity instruction by moving all non-zero + /// elements into lower part of a_vec to half its effective size. + /// + /// @param a_vec Vector to be compressed. 
+ /// + /// @return Four 2-bit indexes of non-zero elements locations + /// + template + CK_TILE_DEVICE int32_t compress_a(AVec& a_vec) const + { + int32_t idx = 0b11101110; + + static_for<0, 2, 1>{}([&](auto i) { + ADataType nonzero_elems[2] = {a_vec[i * 4 + 2], a_vec[i * 4 + 3]}; + int32_t non_zero_pos = 0; + + static_for<0, 3, 1>{}([&](auto j) { + if(a_vec[i * 4 + j] != 0.0f) + { + nonzero_elems[non_zero_pos] = a_vec[i * 4 + j]; + idx &= ~(0b11 << 2 * (i * 2 + non_zero_pos)); + idx |= j << 2 * (i * 2 + non_zero_pos); + ++non_zero_pos; + } + }); + a_vec[i * 2] = nonzero_elems[0]; + a_vec[i * 2 + 1] = nonzero_elems[1]; + }); + + return idx; + } + + template + CK_TILE_DEVICE void + operator()(CTensor& c, const ATensor& a, const BTensor& b, bool_constant = {}) const + { + static_assert(detail::is_similiar_distributed_tensor_v && + detail::is_similiar_distributed_tensor_v && + detail::is_similiar_distributed_tensor_v); + constexpr auto CompressionRatio = WarpGemmAttribute::kCompressionRatio; + + using AVec = ext_vector_t; + using AVecCompressed = + ext_vector_t; + using BVec = ext_vector_t; + using CVec = ext_vector_t; + + constexpr auto I0 = number<0>{}; + + auto a_vec = a.get_thread_buffer().template get_as()[I0]; + const auto b_vec = b.get_thread_buffer().template get_as()[I0]; + auto c_vec = c.get_thread_buffer().template get_as()[I0]; + + const int32_t idx = compress_a(a_vec); + + // @TODO can we simply set a_vec_pruned to a_vec[0:3]? 
+ const AVecCompressed a_vec_pruned = {a_vec[0], a_vec[1], a_vec[2], a_vec[3]}; + + // c_vec += a_vec * b_vec[idx] + WarpGemmAttribute{}(c_vec, a_vec_pruned, b_vec, idx, bool_constant{}); + + c.get_thread_buffer().template set_as(I0, c_vec); + } +}; + +} // namespace ck_tile From 74fda2e796fbdce6688882347c12a3710eeef250 Mon Sep 17 00:00:00 2001 From: Muhammed Emin Ozturk Date: Fri, 11 Apr 2025 10:17:29 -0700 Subject: [PATCH 039/443] CkProfiler StreamK GemmUniversal Fix and Split Gemm_universal Test Redo PR #2044 (#2070) * fix and split gemm_universal test * Update test_gemm_universal_streamk_ut_cases_fp8.inc --- .../profile_gemm_universal_streamk_impl.hpp | 2 +- test/gemm_universal/CMakeLists.txt | 15 ++- ... => test_gemm_universal_ut_cases_bf16.inc} | 60 +++------- .../test_gemm_universal_ut_cases_fp16.inc | 113 ++++++++++++++++++ .../test_gemm_universal_ut_cases_fp8.inc | 113 ++++++++++++++++++ ...l.cpp => test_gemm_universal_xdl_bf16.cpp} | 34 ++---- .../test_gemm_universal_xdl_fp16.cpp | 82 +++++++++++++ .../test_gemm_universal_xdl_fp8.cpp | 71 +++++++++++ ...t_gemm_universal_streamk_ut_cases_fp16.inc | 28 ----- ...st_gemm_universal_streamk_ut_cases_fp8.inc | 28 ----- .../test_gemm_universal_streamk_util.hpp | 12 +- 11 files changed, 423 insertions(+), 135 deletions(-) mode change 100644 => 100755 profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp mode change 100644 => 100755 test/gemm_universal/CMakeLists.txt rename test/gemm_universal/{test_gemm_universal_ut_cases.inc => test_gemm_universal_ut_cases_bf16.inc} (75%) create mode 100644 test/gemm_universal/test_gemm_universal_ut_cases_fp16.inc create mode 100644 test/gemm_universal/test_gemm_universal_ut_cases_fp8.inc rename test/gemm_universal/{test_gemm_universal_xdl.cpp => test_gemm_universal_xdl_bf16.cpp} (61%) create mode 100644 test/gemm_universal/test_gemm_universal_xdl_fp16.cpp create mode 100644 test/gemm_universal/test_gemm_universal_xdl_fp8.cpp diff --git 
a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp old mode 100644 new mode 100755 index d145ab1766..e625fae808 --- a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp +++ b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp @@ -166,7 +166,7 @@ bool profile_gemm_universal_streamk_impl(int do_verification, 0, 1, 2, 3, 4}; // 0: Data Parallel (DP) mode (Stream-K OFF), 1: 1-tile Stream-K+ DP, // 2:2-tile Stream-K + DP - if(Grid_size != -1) + if(Grid_size == -1) { grid_size_list = {Grid_size}; } diff --git a/test/gemm_universal/CMakeLists.txt b/test/gemm_universal/CMakeLists.txt old mode 100644 new mode 100755 index 4aab6323cc..cf5c68e220 --- a/test/gemm_universal/CMakeLists.txt +++ b/test/gemm_universal/CMakeLists.txt @@ -1,4 +1,15 @@ -add_gtest_executable(test_gemm_universal test_gemm_universal_xdl.cpp) +add_gtest_executable(test_gemm_universal_fp16 test_gemm_universal_xdl_fp16.cpp) if(result EQUAL 0) - target_link_libraries(test_gemm_universal PRIVATE utility device_gemm_universal_instance) + target_link_libraries(test_gemm_universal_fp16 PRIVATE utility device_gemm_universal_instance) endif() + +add_gtest_executable(test_gemm_universal_fp8 test_gemm_universal_xdl_fp8.cpp) +if(result EQUAL 0) + target_link_libraries(test_gemm_universal_fp8 PRIVATE utility device_gemm_universal_instance) +endif() + +add_gtest_executable(test_gemm_universal_bf16 test_gemm_universal_xdl_bf16.cpp) +if(result EQUAL 0) + target_link_libraries(test_gemm_universal_bf16 PRIVATE utility device_gemm_universal_instance) +endif() + diff --git a/test/gemm_universal/test_gemm_universal_ut_cases.inc b/test/gemm_universal/test_gemm_universal_ut_cases_bf16.inc similarity index 75% rename from test/gemm_universal/test_gemm_universal_ut_cases.inc rename to test/gemm_universal/test_gemm_universal_ut_cases_bf16.inc index 9a21666856..8a6c672a9f 100644 --- 
a/test/gemm_universal/test_gemm_universal_ut_cases.inc +++ b/test/gemm_universal/test_gemm_universal_ut_cases_bf16.inc @@ -1,6 +1,6 @@ #pragma once -TYPED_TEST(TestGemmUniversal_MK_KN, SmallM) +TYPED_TEST(TestGemmUniversal_BF16_MK_KN, SmallM) { std::vector Ms{1, 2, 3, 4, 5, 6}; constexpr int N = 512; @@ -14,7 +14,7 @@ TYPED_TEST(TestGemmUniversal_MK_KN, SmallM) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_MK_NK, SmallM) +TYPED_TEST(TestGemmUniversal_BF16_MK_NK, SmallM) { std::vector Ms{1, 2, 3, 4, 5, 6}; constexpr int N = 512; @@ -28,7 +28,7 @@ TYPED_TEST(TestGemmUniversal_MK_NK, SmallM) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_KM_KN, SmallM) +TYPED_TEST(TestGemmUniversal_BF16_KM_KN, SmallM) { std::vector Ms{1, 2, 3, 4, 5, 6}; constexpr int N = 512; @@ -44,7 +44,7 @@ TYPED_TEST(TestGemmUniversal_KM_KN, SmallM) } } -TYPED_TEST(TestGemmUniversal_KM_NK, SmallM) +TYPED_TEST(TestGemmUniversal_BF16_KM_NK, SmallM) { std::vector Ms{1, 2, 3, 4, 5, 6}; constexpr int N = 512; @@ -60,7 +60,7 @@ TYPED_TEST(TestGemmUniversal_KM_NK, SmallM) } } -TYPED_TEST(TestGemmUniversal_MK_KN, MidLargeM) +TYPED_TEST(TestGemmUniversal_BF16_MK_KN, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; constexpr int N = 512; @@ -74,7 +74,7 @@ TYPED_TEST(TestGemmUniversal_MK_KN, MidLargeM) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_MK_NK, MidLargeM) +TYPED_TEST(TestGemmUniversal_BF16_MK_NK, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; constexpr int N = 512; @@ -88,7 +88,7 @@ TYPED_TEST(TestGemmUniversal_MK_NK, MidLargeM) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_KM_KN, MidLargeM) +TYPED_TEST(TestGemmUniversal_BF16_KM_KN, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; constexpr int N = 512; @@ -104,7 +104,7 @@ TYPED_TEST(TestGemmUniversal_KM_KN, MidLargeM) } } -TYPED_TEST(TestGemmUniversal_KM_NK, MidLargeM) 
+TYPED_TEST(TestGemmUniversal_BF16_KM_NK, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; constexpr int N = 512; @@ -120,7 +120,7 @@ TYPED_TEST(TestGemmUniversal_KM_NK, MidLargeM) } } -TYPED_TEST(TestGemmUniversal_MK_KN, PaddK) +TYPED_TEST(TestGemmUniversal_BF16_MK_KN, PaddK) { std::vector Ms{127}; constexpr int N = 512; @@ -134,7 +134,7 @@ TYPED_TEST(TestGemmUniversal_MK_KN, PaddK) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_MK_NK, PaddK) +TYPED_TEST(TestGemmUniversal_BF16_MK_NK, PaddK) { std::vector Ms{127}; constexpr int N = 512; @@ -148,7 +148,7 @@ TYPED_TEST(TestGemmUniversal_MK_NK, PaddK) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_KM_KN, PaddK) +TYPED_TEST(TestGemmUniversal_BF16_KM_KN, PaddK) { std::vector Ms{127}; constexpr int N = 512; @@ -164,7 +164,7 @@ TYPED_TEST(TestGemmUniversal_KM_KN, PaddK) } } -TYPED_TEST(TestGemmUniversal_KM_NK, PaddK) +TYPED_TEST(TestGemmUniversal_BF16_KM_NK, PaddK) { std::vector Ms{127}; constexpr int N = 512; @@ -180,7 +180,7 @@ TYPED_TEST(TestGemmUniversal_KM_NK, PaddK) } } -TYPED_TEST(TestGemmUniversal_MK_KN, Regular) +TYPED_TEST(TestGemmUniversal_BF16_MK_KN, Regular) { std::vector Ms{512}; constexpr int N = 512; @@ -194,7 +194,7 @@ TYPED_TEST(TestGemmUniversal_MK_KN, Regular) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_MK_NK, Regular) +TYPED_TEST(TestGemmUniversal_BF16_MK_NK, Regular) { std::vector Ms{512}; constexpr int N = 512; @@ -207,35 +207,3 @@ TYPED_TEST(TestGemmUniversal_MK_NK, Regular) for(int M : Ms) this->Run(M, N, K, StrideA, StrideB, StrideC); } - -TYPED_TEST(TestGemmUniversal_KM_KN, Regular) -{ - std::vector Ms{512}; - constexpr int N = 512; - constexpr int K = 512; - - constexpr int StrideB = N; - constexpr int StrideC = N; - - for(int M : Ms) - { - int StrideA = M; - this->Run(M, N, K, StrideA, StrideB, StrideC); - } -} - -TYPED_TEST(TestGemmUniversal_KM_NK, Regular) -{ - std::vector Ms{512}; - 
constexpr int N = 512; - constexpr int K = 512; - - constexpr int StrideB = N; - constexpr int StrideC = N; - - for(int M : Ms) - { - int StrideA = M; - this->Run(M, N, K, StrideA, StrideB, StrideC); - } -} diff --git a/test/gemm_universal/test_gemm_universal_ut_cases_fp16.inc b/test/gemm_universal/test_gemm_universal_ut_cases_fp16.inc new file mode 100644 index 0000000000..6f6d550625 --- /dev/null +++ b/test/gemm_universal/test_gemm_universal_ut_cases_fp16.inc @@ -0,0 +1,113 @@ +#pragma once + +TYPED_TEST(TestGemmUniversal_FP16_MK_KN, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP16_MK_NK, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP16_MK_KN, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP16_MK_NK, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP16_MK_KN, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 512; + constexpr int K = 437; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, 
StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP16_MK_NK, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 512; + constexpr int K = 437; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP16_MK_KN, Regular) +{ + std::vector Ms{512}; + constexpr int N = 512; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP16_MK_NK, Regular) +{ + std::vector Ms{512}; + constexpr int N = 512; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} diff --git a/test/gemm_universal/test_gemm_universal_ut_cases_fp8.inc b/test/gemm_universal/test_gemm_universal_ut_cases_fp8.inc new file mode 100644 index 0000000000..b831e15e9c --- /dev/null +++ b/test/gemm_universal/test_gemm_universal_ut_cases_fp8.inc @@ -0,0 +1,113 @@ +#pragma once + +TYPED_TEST(TestGemmUniversal_FP8_MK_KN, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP8_MK_NK, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP8_MK_KN, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = N; + 
constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP8_MK_NK, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 512; + constexpr int K = 320; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP8_MK_KN, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 512; + constexpr int K = 437; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP8_MK_NK, PaddK) +{ + std::vector Ms{127}; + constexpr int N = 512; + constexpr int K = 437; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP8_MK_KN, Regular) +{ + std::vector Ms{512}; + constexpr int N = 512; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmUniversal_FP8_MK_NK, Regular) +{ + std::vector Ms{512}; + constexpr int N = 512; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} diff --git a/test/gemm_universal/test_gemm_universal_xdl.cpp b/test/gemm_universal/test_gemm_universal_xdl_bf16.cpp similarity index 61% rename from test/gemm_universal/test_gemm_universal_xdl.cpp rename to test/gemm_universal/test_gemm_universal_xdl_bf16.cpp index b872d7089a..8fde65657a 100644 --- a/test/gemm_universal/test_gemm_universal_xdl.cpp +++ b/test/gemm_universal/test_gemm_universal_xdl_bf16.cpp @@ -7,8 +7,6 @@ #include 
"ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "test_gemm_universal_util.hpp" -using F8 = ck::f8_t; -using F16 = ck::half_t; using BF16 = ck::bhalf_t; using F32 = float; @@ -29,25 +27,25 @@ struct tuple_concat, std::tuple> } // namespace template -class TestGemmUniversal_MK_KN +class TestGemmUniversal_BF16_MK_KN : public ck::test::TestGemmUniversal, Tuple>::type> { }; template -class TestGemmUniversal_MK_NK +class TestGemmUniversal_BF16_MK_NK : public ck::test::TestGemmUniversal, Tuple>::type> { }; template -class TestGemmUniversal_KM_KN +class TestGemmUniversal_BF16_KM_KN : public ck::test::TestGemmUniversal, Tuple>::type> { }; template -class TestGemmUniversal_KM_NK +class TestGemmUniversal_BF16_KM_NK : public ck::test::TestGemmUniversal, Tuple>::type> { }; @@ -55,22 +53,12 @@ class TestGemmUniversal_KM_NK // clang-format off using KernelTypes_MK_KN = ::testing::Types< // ADataType, BDataType, ComputeDataType, CDataType - std::tuple< F16, F16, F16, F16>, -#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) - std::tuple< F16, F8, F16, F16>, - std::tuple< F8, F16, F16, F16>, - std::tuple< F8, F8, F8, BF16>, -#endif + std::tuple< BF16, BF16, BF16, BF16> >; using KernelTypes_MK_NK = ::testing::Types< // ADataType, BDataType, ComputeDataType, CDataType - std::tuple< F16, F16, F16, F16>, -#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) - std::tuple< F16, F8, F16, F16>, - std::tuple< F8, F16, F16, F16>, - std::tuple< F8, F8, F8, BF16>, -#endif + std::tuple< BF16, BF16, BF16, BF16> >; @@ -86,9 +74,9 @@ using KernelTypes_KM_KN = ::testing::Types< // clang-format on -TYPED_TEST_SUITE(TestGemmUniversal_MK_KN, KernelTypes_MK_KN); -TYPED_TEST_SUITE(TestGemmUniversal_MK_NK, KernelTypes_MK_NK); -TYPED_TEST_SUITE(TestGemmUniversal_KM_KN, KernelTypes_KM_KN); -TYPED_TEST_SUITE(TestGemmUniversal_KM_NK, KernelTypes_KM_NK); +TYPED_TEST_SUITE(TestGemmUniversal_BF16_MK_KN, 
KernelTypes_MK_KN); +TYPED_TEST_SUITE(TestGemmUniversal_BF16_MK_NK, KernelTypes_MK_NK); +TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_KN, KernelTypes_KM_KN); +TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_NK, KernelTypes_KM_NK); -#include "test_gemm_universal_ut_cases.inc" +#include "test_gemm_universal_ut_cases_bf16.inc" diff --git a/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp b/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp new file mode 100644 index 0000000000..24f587daf6 --- /dev/null +++ b/test/gemm_universal/test_gemm_universal_xdl_fp16.cpp @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "gtest/gtest.h" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "test_gemm_universal_util.hpp" + +using F8 = ck::f8_t; +using F16 = ck::half_t; + +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +namespace { + +template +struct tuple_concat; + +template +struct tuple_concat, std::tuple> +{ + using type = std::tuple; +}; + +} // namespace + +template +class TestGemmUniversal_FP16_MK_KN + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_FP16_MK_NK + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_FP16_KM_KN + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_FP16_KM_NK + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +// clang-format off +using KernelTypes_MK_KN = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + +#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) + std::tuple< F16, F8, F16, F16>, + std::tuple< F8, F16, F16, F16>, + +#endif + std::tuple< F16, F16, F16, F16> + >; +using KernelTypes_MK_NK = ::testing::Types< + // ADataType, BDataType, 
ComputeDataType, CDataType + +#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) + std::tuple< F16, F8, F16, F16>, + std::tuple< F8, F16, F16, F16>, + +#endif + std::tuple< F16, F16, F16, F16> + >; + +// clang-format on + +TYPED_TEST_SUITE(TestGemmUniversal_FP16_MK_KN, KernelTypes_MK_KN); +TYPED_TEST_SUITE(TestGemmUniversal_FP16_MK_NK, KernelTypes_MK_NK); + +#include "test_gemm_universal_ut_cases_fp16.inc" diff --git a/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp b/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp new file mode 100644 index 0000000000..e833ab7825 --- /dev/null +++ b/test/gemm_universal/test_gemm_universal_xdl_fp8.cpp @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "gtest/gtest.h" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "test_gemm_universal_util.hpp" + +using F8 = ck::f8_t; +using F16 = ck::half_t; +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +namespace { + +template +struct tuple_concat; + +template +struct tuple_concat, std::tuple> +{ + using type = std::tuple; +}; + +} // namespace + +template +class TestGemmUniversal_FP8_MK_KN + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_FP8_MK_NK + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +// clang-format off +using KernelTypes_MK_KN = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType +#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) + std::tuple< F16, F8, F16, F16>, + std::tuple< F8, F16, F16, F16>, + std::tuple< F8, F8, F8, BF16>, +#endif + // Fallback test type when FP8 is not enabled + std::tuple< F16, F16, F16, F16> + >; +using KernelTypes_MK_NK = ::testing::Types< + // 
ADataType, BDataType, ComputeDataType, CDataType + +#if defined(CK_ENABLE_FP8) && (defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)) + std::tuple< F16, F8, F16, F16>, + std::tuple< F8, F16, F16, F16>, + std::tuple< F8, F8, F8, BF16>, +#endif + // Fallback test type when FP8 is not enabled + std::tuple< F16, F16, F16, F16> + >; + + +TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_KN, KernelTypes_MK_KN); +TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_NK, KernelTypes_MK_NK); + + +#include "test_gemm_universal_ut_cases_fp8.inc" diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp16.inc b/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp16.inc index b2fdfe8193..99c8e6d163 100644 --- a/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp16.inc +++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp16.inc @@ -28,34 +28,6 @@ TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_NK, SmallM) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_KN, MidLargeM) -{ - std::vector Ms{127, 255, 312, 799, 1573}; - constexpr int N = 512; - constexpr int K = 320; - - constexpr int StrideA = K; - constexpr int StrideB = N; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - -TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_NK, MidLargeM) -{ - std::vector Ms{127, 255, 312, 799, 1573}; - constexpr int N = 512; - constexpr int K = 320; - - constexpr int StrideA = K; - constexpr int StrideB = K; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - TYPED_TEST(TestGemmUniversal_Streamk_FP16_MK_KN, PaddK) { std::vector Ms{127}; diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp8.inc b/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp8.inc index b3da08f703..b98ee92800 100755 --- 
a/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp8.inc +++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_fp8.inc @@ -28,34 +28,6 @@ TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_NK, SmallM) this->Run(M, N, K, StrideA, StrideB, StrideC); } -TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_KN, MidLargeM) -{ - std::vector Ms{127, 255, 312, 799, 1573}; - constexpr int N = 512; - constexpr int K = 320; - - constexpr int StrideA = K; - constexpr int StrideB = N; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - -TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_NK, MidLargeM) -{ - std::vector Ms{127, 255, 312, 799, 1573}; - constexpr int N = 512; - constexpr int K = 320; - - constexpr int StrideA = K; - constexpr int StrideB = K; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - TYPED_TEST(TestGemmUniversal_Streamk_FP8_MK_KN, PaddK) { std::vector Ms{127}; diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp b/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp index ef3509c0ca..805587a274 100644 --- a/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp +++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_util.hpp @@ -44,9 +44,8 @@ class TestGemmUniversal_Streamk : public testing::Test void SetUp() override { - grid_size_list = {38, 114, 228}; // {38, 76, 114, 152, 190, 228, 266, 304, 342, 380}; - streamk_sel_list = {0, 1, 2}; // 0: Data Parallel (DP) mode (Stream-K OFF), 1: 1-tile - // Stream-K+ DP, // {0, 1, 2, 3, 4} + streamk_sel_list = {0, 1, 2}; // 0: Data Parallel (DP) mode (Stream-K OFF), 1: 1-tile + // Stream-K+ DP, // {0, 1, 2, 3, 4} // 2:2-tile Stream-K + DP } @@ -58,10 +57,9 @@ class TestGemmUniversal_Streamk : public testing::Test const int StrideC) { for(auto streamk_sel : streamk_sel_list) - for(auto grid_size : grid_size_list) - { - RunSingle(M, N, K, StrideA, 
StrideB, StrideC, streamk_sel, grid_size); - } + { + RunSingle(M, N, K, StrideA, StrideB, StrideC, streamk_sel, -1); + } } void RunSingle(const int M, From 0d4f14507818d118696fc345a3a7623b20470c4e Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 11 Apr 2025 12:12:53 -0700 Subject: [PATCH 040/443] Fix build issues for multiple targets. (#2077) * build for multiple targets on gfx942 * add missing ignore statements --- Jenkinsfile | 28 +++---------------- ...rouped_conv_bwd_weight_xdl_cshuffle_v3.hpp | 10 +++++++ 2 files changed, 14 insertions(+), 24 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index dbd484d7bd..d105e385ab 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1112,7 +1112,7 @@ pipeline { beforeAgent true expression { params.RUN_FULL_QA.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() } } - agent{ label rocmnode("gfx90a") } + agent{ label rocmnode("gfx942") } environment{ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \ -DGPU_TARGETS="gfx908;gfx90a;gfx942" \ @@ -1128,26 +1128,6 @@ pipeline { cleanWs() } } - stage("Build CK and run Tests on gfx942") - { - when { - beforeAgent true - expression { params.RUN_FULL_QA.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() } - } - agent{ label rocmnode("gfx942") } - environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx942" -DCMAKE_CXX_FLAGS=" -O3 " """ - execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ - cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ - -DGPU_TARGETS="gfx942" \ - -DCMAKE_CXX_COMPILER="${build_compiler()}" \ - -DCMAKE_CXX_FLAGS=" -O3 " .. 
&& make -j """ - } - steps{ - Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') - cleanWs() - } - } stage("Build CK and run Tests on gfx908") { when { @@ -1194,13 +1174,13 @@ pipeline { beforeAgent true expression { params.BUILD_INSTANCES_ONLY.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() } } - agent{ label rocmnode("gfx90a") } + agent{ label rocmnode("gfx942") } environment{ execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_CXX_COMPILER="${build_compiler()}" \ -D CMAKE_BUILD_TYPE=Release \ - -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \ - -D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j32 """ + -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1151;gfx1201" \ + -D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j64 """ } steps{ buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp index f6ea23a1e7..d56c7abcde 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp @@ -81,6 +81,11 @@ __global__ void k_idx); #else ignore = karg; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = compute_ptr_offset_of_batch; + ignore = num_k_per_block; #endif // end of if (defined(__gfx9__) } @@ -140,6 +145,11 @@ __global__ void k_idx); #else ignore = karg; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore 
= compute_ptr_offset_of_batch; + ignore = num_k_per_block; #endif // end of if (defined(__gfx9__) } From 269f4f6af5aba8c8ac6fe215fcf6ea604dc6b101 Mon Sep 17 00:00:00 2001 From: Thomas Ning Date: Sun, 13 Apr 2025 20:09:30 -0700 Subject: [PATCH 041/443] Solve the Static Encoding Pattern compile error when the tile size is too small (#2079) --- include/ck_tile/core.hpp | 1 + .../algorithm/static_encoding_pattern.hpp | 27 ++++++++++--------- include/ck_tile/ops/epilogue.hpp | 2 +- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index d9aa8b3551..821b3a8e84 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -59,6 +59,7 @@ #include "ck_tile/core/tensor/transpose_tile.hpp" #include "ck_tile/core/tensor/update_tile.hpp" #include "ck_tile/core/utility/bit_cast.hpp" +#include "ck_tile/core/utility/env.hpp" #include "ck_tile/core/utility/functional.hpp" #include "ck_tile/core/utility/functional_with_tuple.hpp" #include "ck_tile/core/utility/ignore.hpp" diff --git a/include/ck_tile/core/algorithm/static_encoding_pattern.hpp b/include/ck_tile/core/algorithm/static_encoding_pattern.hpp index 78884f3f9f..b56bda3741 100644 --- a/include/ck_tile/core/algorithm/static_encoding_pattern.hpp +++ b/include/ck_tile/core/algorithm/static_encoding_pattern.hpp @@ -73,10 +73,11 @@ struct TileDistributionEncodingPattern2D LargestVec ? LargestVec : VecSize; + static constexpr index_t X0 = XPerTile / X1; // # of threads in X dim // # of rows in Y dim accessed by single wavefront in one iteration static constexpr index_t Y1 = warp_size / X0; @@ -124,10 +125,11 @@ struct TileDistributionEncodingPattern2D LargestVec ? 
LargestVec : VecSize; + static constexpr index_t X0 = XPerTile / X1; // # of threads in X dim static constexpr index_t Y2 = warp_size / X0; // # of rows in Y dim to cover whole wavefront static_assert(X0 * Y2 == warp_size, "X0 * Y2 must cover whole wavefront!"); @@ -173,10 +175,11 @@ struct TileDistributionEncodingPattern2D LargestVec ? LargestVec : VecSize; + static constexpr index_t X0 = XPerTile / X1; // # of threads in X dim static constexpr index_t Y2 = warp_size / X0; // # of rows in Y dim to cover whole wavefront static_assert(X0 * Y2 == warp_size, "X0 * Y2 must cover whole wavefront!"); static constexpr index_t Y1 = num_warps; diff --git a/include/ck_tile/ops/epilogue.hpp b/include/ck_tile/ops/epilogue.hpp index 12e53e13e6..6cc0fa8540 100644 --- a/include/ck_tile/ops/epilogue.hpp +++ b/include/ck_tile/ops/epilogue.hpp @@ -4,9 +4,9 @@ #pragma once #include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp" +#include "ck_tile/ops/epilogue/default_2d_and_dynamic_quant_epilogue.hpp" #include "ck_tile/ops/epilogue/default_2d_epilogue.hpp" #include "ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp" -#include "ck_tile/ops/epilogue/default_2d_and_dynamic_quant_epilogue.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" From 56378f810fdd328fec449e6574af656148e4c894 Mon Sep 17 00:00:00 2001 From: Mingtao Gu <145657261+mtgu0705@users.noreply.github.com> Date: Mon, 14 Apr 2025 16:58:57 +0800 Subject: [PATCH 042/443] CK pk_i4_t test failures fix (SWDEV-518629) (#2075) * fix pk_i4_v3 tests failures in Unbuntu env. * fix pk_i4_t tests failure on Unbuntu issues. * some fixed. 
--------- Co-authored-by: mtgu0705 --- example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp | 12 +++++++--- example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp | 12 +++++++--- .../gemm_xdl_fp16_pk_i4_v3_b_scale.cpp | 12 +++++++--- .../gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp | 13 ++++++++--- example/01_gemm/gemm_xdl_fp8_pk_i4_v3.cpp | 12 +++++++--- ..._batched_gemm_example_fp16int4_b_scale.inc | 3 ++- .../moe_gemm1_xdl_pk_i4.cpp | 11 +++++++--- .../moe_gemm2_xdl_pk_i4.cpp | 11 +++++++--- ...evice_batched_gemm_xdl_fpAintB_b_scale.hpp | 16 ++++++++++---- .../impl/device_gemm_xdl_cshuffle_v3.hpp | 22 +++++++++++++++---- ...vice_gemm_xdl_cshuffle_v3_b_preshuffle.hpp | 22 +++++++++++++++---- .../device_gemm_xdl_cshuffle_v3_b_scale.hpp | 22 +++++++++++++++---- .../gpu/device/impl/device_moe_gemm.hpp | 22 +++++++++++++++---- 13 files changed, 148 insertions(+), 42 deletions(-) diff --git a/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp b/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp index 7c232f1bcf..7178ad46b9 100644 --- a/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp +++ b/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp @@ -133,7 +133,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); - DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize() / 2); DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); // weight permute @@ -192,14 +192,20 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) b_element_op, c_element_op); - if(!gemm.IsSupportedArgument(argument) || ck::get_device_name() != "gfx942" || - ck::get_device_name() != "gfx950") + if(!gemm.IsSupportedArgument(argument)) { std::cerr << gemm.GetTypeString() << " does not support this 
problem" << std::endl; return true; } + if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950")) + { + std::cout << "This kernel support gfx942 and gfx950 only" << std::endl; + + return true; + } + bool pass = true; if(config.do_verification) { diff --git a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp index 61c5a32d5d..e16f184a20 100644 --- a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp +++ b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3.cpp @@ -134,7 +134,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); - DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize() / 2); DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); // weight permute @@ -242,14 +242,20 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) b_element_op, c_element_op); - if(!gemm.IsSupportedArgument(argument) || ck::get_device_name() != "gfx942" || - ck::get_device_name() != "gfx950") + if(!gemm.IsSupportedArgument(argument)) { std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl; return true; } + if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950")) + { + std::cout << "This kernel support gfx942 and gfx950 only" << std::endl; + + return true; + } + bool pass = true; if(config.do_verification) { diff --git a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp index 468dd699a1..f83d479713 100644 --- a/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp +++ b/example/01_gemm/gemm_xdl_fp16_pk_i4_v3_b_scale.cpp @@ -161,7 +161,7 @@ bool run_gemm(const ProblemType& 
problem_size, const ExecutionConfig& config) std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); - DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize() / 2); DeviceMem b1_scale_device_buf(sizeof(BScaleDataType) * b1_k_n.mDesc.GetElementSpaceSize()); DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); @@ -274,14 +274,20 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) b_element_op, c_element_op); - if(!gemm.IsSupportedArgument(argument) || ck::get_device_name() != "gfx942" || - ck::get_device_name() != "gfx950") + if(!gemm.IsSupportedArgument(argument)) { std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl; return true; } + if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950")) + { + std::cout << "This kernel support gfx942 and gfx950 only" << std::endl; + + return true; + } + bool pass = true; if(config.do_verification) { diff --git a/example/01_gemm/gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp b/example/01_gemm/gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp index 80f7e95d30..266a1e9d3e 100644 --- a/example/01_gemm/gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp +++ b/example/01_gemm/gemm_xdl_fp8_pk_i4_bpreshuffle_v3.cpp @@ -152,7 +152,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); - DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_preshuffled.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_preshuffled.mDesc.GetElementSpaceSize() / + 2); DeviceMem c_m_n_device_buf(sizeof(CDataType) * 
c_m_n_device_result.mDesc.GetElementSpaceSize()); // do GEMM @@ -261,14 +262,20 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) b_element_op, c_element_op); - if(!gemm.IsSupportedArgument(argument) || ck::get_device_name() != "gfx942" || - ck::get_device_name() != "gfx950") + if(!gemm.IsSupportedArgument(argument)) { std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl; return true; } + if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950")) + { + std::cout << "This kernel support gfx942 and gfx950 only" << std::endl; + + return true; + } + bool pass = true; if(config.do_verification) { diff --git a/example/01_gemm/gemm_xdl_fp8_pk_i4_v3.cpp b/example/01_gemm/gemm_xdl_fp8_pk_i4_v3.cpp index 7b72461dd9..0575314dff 100644 --- a/example/01_gemm/gemm_xdl_fp8_pk_i4_v3.cpp +++ b/example/01_gemm/gemm_xdl_fp8_pk_i4_v3.cpp @@ -132,7 +132,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); - DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize() / 2); DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); // weight permute @@ -240,14 +240,20 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) b_element_op, c_element_op); - if(!gemm.IsSupportedArgument(argument) || ck::get_device_name() != "gfx942" || - ck::get_device_name() != "gfx950") + if(!gemm.IsSupportedArgument(argument)) { std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl; return true; } + if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950")) + { + std::cout << "This kernel support gfx942 and gfx950 only" << 
std::endl; + + return true; + } + bool pass = true; if(config.do_verification) { diff --git a/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc b/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc index 8c4913dbcc..3582bc5e33 100644 --- a/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc +++ b/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc @@ -212,7 +212,8 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl; DeviceMem a_g_m_k_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize()); - DeviceMem b_g_k_n_device_buf(sizeof(BDataType) * b_g_k_n_permute.mDesc.GetElementSpaceSize()); + DeviceMem b_g_k_n_device_buf(sizeof(BDataType) * b_g_k_n_permute.mDesc.GetElementSpaceSize() / + 2); DeviceMem b1_g_scale_device_buf(sizeof(BScaleDataType) * b1_g_k_n.mDesc.GetElementSpaceSize()); DeviceMem c_g_m_n_device_buf(sizeof(CDataType) * c_g_m_n_device_result.mDesc.GetElementSpaceSize()); diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp index 1102ce1054..a25d1b5fa3 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp @@ -301,7 +301,7 @@ int main(int argc, char* argv[]) DeviceMem expert_ids_dev(sizeof(ck::index_t) * expert_ids.mDesc.GetElementSpaceSize()); DeviceMem max_token_id_dev(sizeof(ck::index_t) * max_token_id.mDesc.GetElementSpaceSize()); DeviceMem a0_device_buf(sizeof(A0DataType) * a0_t_k.mDesc.GetElementSpaceSize()); - DeviceMem b0_device_buf(sizeof(B0DataType) * b0_e_n_k.mDesc.GetElementSpaceSize()); + DeviceMem b0_device_buf(sizeof(B0DataType) * b0_e_n_k.mDesc.GetElementSpaceSize() / 2); DeviceMem d0_device_buf(sizeof(D0DataType) * d0_t_n.mDesc.GetElementSpaceSize()); DeviceMem d1_device_buf(sizeof(D1DataType) * 
d1_e_n.mDesc.GetElementSpaceSize()); DeviceMem e_device_buf(sizeof(EDataType) * e_t_n_device_result.mDesc.GetElementSpaceSize()); @@ -440,13 +440,18 @@ int main(int argc, char* argv[]) b_element_op, cde_element_op); - if(!device_op.IsSupportedArgument(argument) || - !(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950")) + if(!device_op.IsSupportedArgument(argument)) { throw std::runtime_error( "wrong! device_gemm with the specified compilation parameters does " "not support this GEMM problem"); } + + if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950")) + { + std::cout << "This kernel support gfx942 and gfx950 only" << std::endl; + } + if(time_kernel) { float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp index 528503a2c4..8c2c70b4a1 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp @@ -298,7 +298,7 @@ int main(int argc, char* argv[]) DeviceMem expert_ids_dev(sizeof(ck::index_t) * expert_ids.mDesc.GetElementSpaceSize()); DeviceMem max_token_id_dev(sizeof(ck::index_t) * max_token_id.mDesc.GetElementSpaceSize()); DeviceMem a0_device_buf(sizeof(A0DataType) * a0_t_k_k.mDesc.GetElementSpaceSize()); - DeviceMem b0_device_buf(sizeof(B0DataType) * b0_e_n_k.mDesc.GetElementSpaceSize()); + DeviceMem b0_device_buf(sizeof(B0DataType) * b0_e_n_k.mDesc.GetElementSpaceSize() / 2); DeviceMem d0_device_buf(sizeof(D0DataType) * d0_t_n.mDesc.GetElementSpaceSize()); DeviceMem d1_device_buf(sizeof(D1DataType) * d1_e_n.mDesc.GetElementSpaceSize()); DeviceMem d2_device_buf(sizeof(D2DataType) * d2_e_n.mDesc.GetElementSpaceSize()); @@ -407,13 +407,18 @@ int main(int argc, char* argv[]) b_element_op, cde_element_op); - if(!device_op.IsSupportedArgument(argument) || - !(ck::get_device_name() == "gfx942" || 
ck::get_device_name() == "gfx950")) + if(!device_op.IsSupportedArgument(argument)) { throw std::runtime_error( "wrong! device_gemm with the specified compilation parameters does " "not support this GEMM problem"); } + + if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950")) + { + std::cout << "This kernel support gfx942 and gfx950 only" << std::endl; + } + if(time_kernel) { // not result correct here because output buf not setzero diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl_fpAintB_b_scale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl_fpAintB_b_scale.hpp index 963f0edd08..7d9555dc82 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl_fpAintB_b_scale.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl_fpAintB_b_scale.hpp @@ -224,12 +224,20 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale PermuteA, PermuteB>; + static constexpr index_t APackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + static constexpr index_t BPackedSize = []() { if constexpr(is_same_v, pk_i4_t>) return 2; else return 1; }(); + struct ComputePtrOffsetOfStridedBatch { ComputePtrOffsetOfStridedBatch(index_t BatchStrideA, @@ -352,10 +360,10 @@ struct DeviceBatchedGemm_Xdl_CShuffleV3_BScale const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1( arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0); - auto size_a_buffer = - a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType); - auto size_b_buffer = - b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType); + auto size_a_buffer = a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * + sizeof(ADataType) / APackedSize; + auto size_b_buffer = b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * + sizeof(BDataType) / BPackedSize; ck::utility::RotatingMemWrapper rotating_mem( arg_, stream_config.rotating_count, size_a_buffer, 
size_b_buffer); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp index 51c223efd2..dde21725d0 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp @@ -229,6 +229,20 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2, pk_i4_t>) + return 2; + else + return 1; + }(); + + static constexpr index_t BPackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + /// @brief Helper structure responsible for kernel invocation. /// /// @paragraph The `Invoker` class is responsible for preparation and invocation of actual GPU @@ -278,10 +292,10 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2 rotating_mem( arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_preshuffle.hpp index 58a182b924..faa235be50 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_preshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_preshuffle.hpp @@ -130,6 +130,20 @@ struct DeviceGemm_Xdl_CShuffleV3_BPreshuffle : public DeviceGemmV2BPreshuffle, pk_i4_t>) + return 2; + else + return 1; + }(); + + static constexpr index_t BPackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + int GetPreShuffleParameters() override { return NPerXDL; } // Invoker @@ -168,10 +182,10 @@ struct DeviceGemm_Xdl_CShuffleV3_BPreshuffle : public DeviceGemmV2BPreshuffle rotating_mem( arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp 
b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp index 044350d11c..456e5e90d1 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp @@ -139,6 +139,20 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2BScale, pk_i4_t>) + return 2; + else + return 1; + }(); + + static constexpr index_t BPackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + // Invoker struct Invoker : public BaseInvoker { @@ -174,10 +188,10 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2BScale rotating_mem( arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp index 950fe0236d..f3fc1aaa9f 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp @@ -139,6 +139,20 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle, pk_i4_t>) + return 2; + else + return 1; + }(); + + static constexpr index_t BPackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + int GetPreShuffleParameters() override { return NPerXDL; } // Invoker @@ -179,10 +193,10 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle Date: Mon, 14 Apr 2025 16:41:47 -0700 Subject: [PATCH 043/443] Upgrade default docker image to ROCm6.4 release. 
(#2082) * upgrade to rocm6.4 * fix gfx10 generic target syntax * use gfx1101 target for unit tests * use gfx1201 target for unit tests * do not use generic targets until 6.4.1 release * update target list and dockerfile.compiler --- Dockerfile | 15 +++++++-------- Dockerfile.compiler | 2 +- Jenkinsfile | 14 +++++++------- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/Dockerfile b/Dockerfile index 17800d92d5..2a8fb707c9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:22.04 ARG DEBIAN_FRONTEND=noninteractive -ARG ROCMVERSION=6.3 +ARG ROCMVERSION=6.4 ARG compiler_version="" ARG compiler_commit="" ARG CK_SCCACHE="" @@ -13,15 +13,15 @@ RUN set -xe && \ apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \ curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg -RUN if [ "$ROCMVERSION" != "6.4" ]; then \ - sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.3.60300-1_all.deb --no-check-certificate" && \ +RUN if [ "$ROCMVERSION" != "6.5" ]; then \ + sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/jammy/amdgpu-install_6.3.60300-1_all.deb --no-check-certificate" && \ apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.3.60300-1_all.deb && \ wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ - sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \ - sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \ + sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO jammy main > /etc/apt/sources.list.d/rocm.list" && \ + sh -c 'echo deb [arch=amd64 
signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu jammy main > /etc/apt/sources.list.d/amdgpu.list'; \ fi -RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" && \ +RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu jammy main universe | tee -a /etc/apt/sources.list" && \ amdgpu-install -y --usecase=rocm --no-dkms ## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined @@ -51,7 +51,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- mpich \ net-tools \ pkg-config \ - python \ python3 \ python3-dev \ python3-pip \ @@ -99,7 +98,7 @@ RUN pip install --upgrade cmake==3.27.5 && \ dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \ # Install packages for processing the performance results pip3 install --upgrade pip && \ - pip3 install --upgrade pytest sqlalchemy==2.0.36 pymysql pandas==2.2.3 setuptools-rust setuptools>=75 sshtunnel==0.4.0 && \ + pip3 install --upgrade pytest sqlalchemy==2.0.36 pymysql pandas==2.2.3 setuptools-rust setuptools sshtunnel==0.4.0 && \ # Add render group groupadd -f render && \ # Install the new rocm-cmake version diff --git a/Dockerfile.compiler b/Dockerfile.compiler index a22103b96b..f4aa12f356 100644 --- a/Dockerfile.compiler +++ b/Dockerfile.compiler @@ -1,4 +1,4 @@ -ARG BASE_DOCKER="rocm/composable_kernel:ck_ub22.04_rocm6.3" +ARG BASE_DOCKER="rocm/composable_kernel:ck_ub22.04_rocm6.4" FROM $BASE_DOCKER ARG compiler_version="" ARG compiler_commit="" diff --git a/Jenkinsfile b/Jenkinsfile index d105e385ab..e6256fc3d8 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -39,7 +39,7 @@ def getBaseDockerImageName(){ } else{ def ROCM_numeric = "${params.ROCMVERSION}" as float - if ( ROCM_numeric < 6.4 ){ + if ( ROCM_numeric < 6.5 ){ img = "${env.CK_DOCKERHUB}:ck_ub22.04_rocm${params.ROCMVERSION}" } else{ @@ -519,13 +519,13 @@ def Build_CK(Map conf=[:]){ else if ( runShell('grep -n 
"gfx942" rocminfo.log') ) { arch_type = 2 } - else if ( runShell('grep -n "gfx1030" rocminfo.log') ) { + else if ( runShell('grep -n "gfx10" rocminfo.log') ) { arch_type = 3 } - else if ( runShell('grep -n "gfx1101" rocminfo.log') ) { + else if ( runShell('grep -n "gfx11" rocminfo.log') ) { arch_type = 4 } - else if ( runShell('grep -n "gfx1201" rocminfo.log') ) { + else if ( runShell('grep -n "gfx12" rocminfo.log') ) { arch_type = 5 } else if ( runShell('grep -n "gfx908" rocminfo.log') ) { @@ -744,8 +744,8 @@ def process_results(Map conf=[:]){ } //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version -CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;ROCMVERSION=6.3;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true - 0 21 * * * % ROCMVERSION=6.3;hipTensor_test=true;RUN_CODEGEN_TESTS=true;BUILD_GFX908=true; +CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;ROCMVERSION=6.4;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true + 0 21 * * * % ROCMVERSION=6.4;hipTensor_test=true;RUN_CODEGEN_TESTS=true;BUILD_GFX908=true; 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false @@ -770,7 +770,7 @@ pipeline { description: 'If you want to use a custom docker image, please specify it here (default: leave blank).') string( name: 'ROCMVERSION', - defaultValue: '6.3', + defaultValue: '6.4', description: 'Specify which ROCM version to use: 6.3 (default).') string( name: 'COMPILER_VERSION', From 7106976a72897f44b05260bd1ae1f70b319a4e75 Mon Sep 17 00:00:00 2001 From: Andriy Roshchenko 
<107577548+andriy-ca@users.noreply.github.com> Date: Tue, 15 Apr 2025 17:17:07 -0600 Subject: [PATCH 044/443] MX GEMM - New GEMM pipeline for MX data types (#2059) * Allow selection of mfma_scale instructions * Read B tensor from LDS to VGPR in chunks of 16 in MFMA order * Add constexpr and synchronize return type for `get_exponent_value` * Pass scales by reference and add comments to `mfma_scale_f32_32x32x64` * Add support for microscaling instructions in `XdlopsGemm` * Fix `mfma_scale_f32_16x16x128f8f6f4` wrapper * Remove software implementation of MX GEMM * Make interface of `intrin_mfma_scale_f32_16x16x128f8f6f4<16, 16>` consistent with the other scale instruction * Update README * Updated CHANGELOG * Remove unused static methods --- CHANGELOG.md | 1 + example/67_gemm_microscaling/CMakeLists.txt | 9 +- example/67_gemm_microscaling/README.md | 8 +- .../67_gemm_microscaling/gemm_mx_common.hpp | 79 +-- example/67_gemm_microscaling/gemm_mx_fp8.cpp | 98 ++++ .../gemm_mx_fp8_e8m0_scale.cpp | 42 -- .../gemm_mx_fp8_fp16_scale.cpp | 42 -- .../gemm_mx_fp8_fp8_scale.cpp | 42 -- ...blockwise_gemm_mx_pipeline_xdlops_base.hpp | 363 ++++++++++++ ...kwise_gemm_pipeline_xdlops_mx_selector.hpp | 35 +- .../blockwise_gemm_pipeline_xdlops_v1_mx.hpp | 546 +++++++++--------- .../impl/device_gemm_xdl_cshuffle_v3_mx.hpp | 14 +- .../grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp | 122 ++-- .../threadwise_tensor_slice_transfer.hpp | 3 +- .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 89 ++- include/ck/utility/amd_xdlops.hpp | 16 +- include/ck/utility/e8m0.hpp | 4 +- include/ck/utility/mxfp_utils.hpp | 4 +- test/mx_mfma_op/mx_mfma_op.hpp | 98 ++-- 19 files changed, 1007 insertions(+), 608 deletions(-) create mode 100644 example/67_gemm_microscaling/gemm_mx_fp8.cpp delete mode 100644 example/67_gemm_microscaling/gemm_mx_fp8_e8m0_scale.cpp delete mode 100644 example/67_gemm_microscaling/gemm_mx_fp8_fp16_scale.cpp delete mode 100644 example/67_gemm_microscaling/gemm_mx_fp8_fp8_scale.cpp create 
mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp diff --git a/CHANGELOG.md b/CHANGELOG.md index e3d7971c71..b9012c0a77 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj * Added support for GKCYX layout for grouped convolution backward weight (NGCHW/GKCYX/NGKHW). * Added support for GKCYX layout for grouped convolution backward data (NGCHW/GKCYX/NGKHW). * Added support for Stream-K version of mixed fp8/bf16 GEMM +* Added GEMM pipeline for microscaling (MX) data types * Added support for FP16 2:4 structured sparsity to universal GEMM. ### Optimized diff --git a/example/67_gemm_microscaling/CMakeLists.txt b/example/67_gemm_microscaling/CMakeLists.txt index 9e95c3e007..93770684df 100644 --- a/example/67_gemm_microscaling/CMakeLists.txt +++ b/example/67_gemm_microscaling/CMakeLists.txt @@ -1,10 +1,5 @@ add_custom_target(example_gemm_mx) -add_example_executable(example_gemm_mx_fp8_e8m0_scale gemm_mx_fp8_e8m0_scale.cpp) -add_example_dependencies(example_gemm_mx example_gemm_mx_fp8_e8m0_scale) +add_example_executable(example_gemm_mx_fp8 gemm_mx_fp8.cpp) +add_example_dependencies(example_gemm_mx example_gemm_mx_fp8) -add_example_executable(example_gemm_mx_fp8_fp8_scale gemm_mx_fp8_fp8_scale.cpp) -add_example_dependencies(example_gemm_mx example_gemm_mx_fp8_fp8_scale) - -add_example_executable(example_gemm_mx_fp8_fp16_scale gemm_mx_fp8_fp16_scale.cpp) -add_example_dependencies(example_gemm_mx example_gemm_mx_fp8_fp16_scale) diff --git a/example/67_gemm_microscaling/README.md b/example/67_gemm_microscaling/README.md index 713902588d..57b6490eda 100644 --- a/example/67_gemm_microscaling/README.md +++ b/example/67_gemm_microscaling/README.md @@ -10,16 +10,16 @@ Custom verification parameters: # arg4: verbosity (0=no info, 1=verbose info) # arg5 to 10: M(128x), N(128x), K(64x), StrideA, StrideB, StrideC # arg11: KBatch 
-./bin/example_gemm_mx_fp8_e8m0_scale 1 1 0 1 +./bin/example_gemm_mx_fp8 1 1 0 1 ``` Custom tensor shapes: ```bash -./bin/example_gemm_mx_fp8_fp16_scale 1 2 1 0 128 128 64 -1 -1 -1 1 +./bin/example_gemm_mx_fp8 1 2 1 0 128 128 256 -1 -1 -1 1 ``` Default invocation: ```bash -# Implies: ./bin/example_gemm_mx_fp8_fp8_scale 1 2 0 0 -./bin/example_gemm_mx_fp8_fp8_scale +# Implies: ./bin/example_gemm_mx_fp8 1 2 0 0 +./bin/example_gemm_mx_fp8 ``` \ No newline at end of file diff --git a/example/67_gemm_microscaling/gemm_mx_common.hpp b/example/67_gemm_microscaling/gemm_mx_common.hpp index 9a05954c73..32ef975192 100644 --- a/example/67_gemm_microscaling/gemm_mx_common.hpp +++ b/example/67_gemm_microscaling/gemm_mx_common.hpp @@ -95,7 +95,7 @@ bool parse_cmd_args(int argc, << std::endl << "arg3: time kernel (0=no, 1=yes)" << std::endl << "arg4: verbosity (0=no info, 1=verbose info)" << std::endl - << "arg5 to 10: M(128x), N(128x), K(64x), StrideA, StrideB, StrideC" << std::endl + << "arg5 to 10: M(128x), N(128x), K(256x), StrideA, StrideB, StrideC" << std::endl << "arg11: KBatch" << std::endl; return false; } @@ -103,7 +103,8 @@ bool parse_cmd_args(int argc, return true; } -template + ck::index_t ScaleBlockSize> bool run_mx_gemm(const ProblemSizeSplitK& problem_size, const ExecutionConfig& config) { - static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; - static constexpr auto BlkGemmPSched = ck::BlockGemmPipelineScheduler::Intrawave; - static constexpr auto BlkGemmPVer = ck::BlockGemmPipelineVersion::v1; - - static constexpr ck::index_t ScaleBlockSize = MXVectorSize; - - static constexpr ck::index_t KPerBlock = 64; - using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMX_Xdl_CShuffleV3< - ALayout, // ALayout - BLayout, // BLayout - CLayout, // CLayout - ADataType, // ADataType - XDataType, // AScaleDataType - BDataType, // BDataType - XDataType, // BScaleDataType - CDataType, // CDataType - AccDataType, // GemmAccDataType 
- CShuffleDataType, // CShuffleDataType - AElementOp, // AElementwiseOperation - BElementOp, // BElementwiseOperation - CElementOp, // CElementwiseOperation - GemmSpec, // GemmSpec - MXVectorSize, // ScaleBlockSize: Scaling block size - 256, // BlockSize: Thread block size - 128, // MPerBlock - 128, // NPerBlock - KPerBlock, // KPerBlock - 16, // AK1 - 16, // BK1 - 32, // MPerXDL - 32, // NPerXDL - 2, // MXdlPerWave - 2, // NXdlPerWave - S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 - S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // ABlockTransferSrcAccessOrder - 2, // ABlockTransferSrcVectorDim - 16, // ABlockTransferSrcScalarPerVector - 16, // ABlockTransferDstScalarPerVector_AK1 - false, // ABlockLdsExtraM - S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 - S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // BBlockTransferSrcAccessOrder - 2, // BBlockTransferSrcVectorDim - 16, // BBlockTransferSrcScalarPerVector - 16, // BBlockTransferDstScalarPerVector_BK1 - false, // BBlockLdsExtraN - 1, // CShuffleMXdlPerWavePerShuffle - 1, // CShuffleNXdlPerWavePerShuffle - S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock - 8, // CShuffleBlockTransferScalarPerVector_NPerBlock - BlkGemmPSched, // BlkGemmPipeSched - BlkGemmPVer, // BlkGemmPipelineVer - ADataType, // ComputeTypeA - BDataType // ComputeTypeB - >; auto M = problem_size.M; auto N = problem_size.N; @@ -230,8 +175,8 @@ bool run_mx_gemm(const ProblemSizeSplitK& problem_size, const ExecutionConfig& c auto Scale_Stride_AM = f_get_default_stride(M, K / ScaleBlockSize, -1, AScaleLayout{}); auto Scale_Stride_BN = f_get_default_stride(K / ScaleBlockSize, N, -1, BScaleLayout{}); - Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, AScaleLayout{})); - Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BScaleLayout{})); + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor 
b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); Tensor a_m_k_scale(f_host_tensor_descriptor( M, K / ScaleBlockSize, Scale_Stride_AM, AScaleLayout{})); // scales for A @@ -428,8 +373,10 @@ bool run_mx_gemm(const ProblemSizeSplitK& problem_size, const ExecutionConfig& c if(config.time_kernel) { - std::size_t flop = std::size_t(2) * M * N * K + - std::size_t(2) * M * N * K / ScaleBlockSize; // GEMM + A scale + B scale + // Output size(M*N) * [dot product(2K) + product of scales(K/ScaleBlockSize) + scaling of + // partial sums(K/ScaleBlockSize)] + // FLOPS = 2 * M * N * K + 2 * M * N * K / ScaleBlockSize + std::size_t flop = std::size_t(2) * M * N * K + std::size_t(2) * M * N * K / ScaleBlockSize; std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N + sizeof(XDataType) * (M * K + K * N) / ScaleBlockSize; @@ -445,7 +392,8 @@ bool run_mx_gemm(const ProblemSizeSplitK& problem_size, const ExecutionConfig& c return res_verified; } -template , // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 16, // ABlockTransferSrcScalarPerVector + 16, // ABlockTransferDstScalarPerVector_AK1 + false, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 16, // BBlockTransferSrcScalarPerVector + 16, // BBlockTransferDstScalarPerVector_BK1 + false, // BBlockLdsExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8, // CShuffleBlockTransferScalarPerVector_NPerBlock + BlkGemmPSched, // BlkGemmPipeSched + BlkGemmPVer, // BlkGemmPipelineVer + ADataType, // ComputeTypeA + BDataType // ComputeTypeB + >; + +int 
main(int argc, char* argv[]) +{ + return run_mx_gemm_example(argc, argv) + ? 0 + : -1; +} diff --git a/example/67_gemm_microscaling/gemm_mx_fp8_e8m0_scale.cpp b/example/67_gemm_microscaling/gemm_mx_fp8_e8m0_scale.cpp deleted file mode 100644 index 393f4a2ea7..0000000000 --- a/example/67_gemm_microscaling/gemm_mx_fp8_e8m0_scale.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. - -#include "gemm_mx_common.hpp" - -using ADataType = ck::f8_t; -using BDataType = ck::f8_t; - -using XDataType = ck::e8m0_bexp_t; - -using CDataType = ck::half_t; -using AccDataType = float; -using CShuffleDataType = CDataType; - -using ALayout = Row; -using BLayout = Col; -using CLayout = Row; - -using AElementOp = PassThrough; // elementwise transformation for A matrix -using BElementOp = PassThrough; // elementwise transformation for B matrix -using CElementOp = PassThrough; // elementwise transformation for C matrix - -constexpr ck::index_t mx_vector_size = 32; // scaling block size - -int main(int argc, char* argv[]) -{ - return run_mx_gemm_example(argc, argv) - ? 0 - : -1; -} diff --git a/example/67_gemm_microscaling/gemm_mx_fp8_fp16_scale.cpp b/example/67_gemm_microscaling/gemm_mx_fp8_fp16_scale.cpp deleted file mode 100644 index dd654a8f69..0000000000 --- a/example/67_gemm_microscaling/gemm_mx_fp8_fp16_scale.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "gemm_mx_common.hpp" - -using ADataType = ck::f8_t; -using BDataType = ck::f8_t; - -using XDataType = ck::half_t; - -using CDataType = ck::half_t; -using AccDataType = float; -using CShuffleDataType = CDataType; - -using ALayout = Row; -using BLayout = Col; -using CLayout = Row; - -using AElementOp = PassThrough; // elementwise transformation for A matrix -using BElementOp = PassThrough; // elementwise transformation for B matrix -using CElementOp = PassThrough; // elementwise transformation for C matrix - -constexpr ck::index_t mx_vector_size = 32; // scaling block size - -int main(int argc, char* argv[]) -{ - return run_mx_gemm_example(argc, argv) - ? 0 - : -1; -} diff --git a/example/67_gemm_microscaling/gemm_mx_fp8_fp8_scale.cpp b/example/67_gemm_microscaling/gemm_mx_fp8_fp8_scale.cpp deleted file mode 100644 index c42d9783be..0000000000 --- a/example/67_gemm_microscaling/gemm_mx_fp8_fp8_scale.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. - -#include "gemm_mx_common.hpp" - -using ADataType = ck::f8_t; -using BDataType = ck::f8_t; - -using XDataType = ck::f8_t; - -using CDataType = ck::half_t; -using AccDataType = float; -using CShuffleDataType = CDataType; - -using ALayout = Row; -using BLayout = Col; -using CLayout = Row; - -using AElementOp = PassThrough; // elementwise transformation for A matrix -using BElementOp = PassThrough; // elementwise transformation for B matrix -using CElementOp = PassThrough; // elementwise transformation for C matrix - -constexpr ck::index_t mx_vector_size = 32; // scaling block size - -int main(int argc, char* argv[]) -{ - return run_mx_gemm_example(argc, argv) - ? 
0 - : -1; -} diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp new file mode 100644 index 0000000000..ebe075b55d --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/utility/blkgemmpipe_scheduler.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/warp/xdlops_gemm.hpp" +#include "ck/tensor_description/tensor_adaptor.hpp" + +namespace ck { + +template +struct BlockwiseGemmXdlops_mx_pipeline_base +{ + using ComputeTypeA = ADataType; + using ComputeTypeB = BDataType; + using AccType = float; // for now only support V_MFMA_SCALE_F32 + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + using ThisThreadBlock = ThisThreadBlock; + + // Hardcode to 64, as HIP-provided "warpSize" would return 32 on RDNA GPUs. + static constexpr index_t WaveSize = 64; + + static constexpr index_t A_K0 = ATileDesc{}.GetLength(I0); + static constexpr index_t B_K0 = BTileDesc{}.GetLength(I0); + static constexpr index_t A_K1 = ATileDesc{}.GetLength(I2); + static constexpr index_t B_K1 = BTileDesc{}.GetLength(I2); + + static constexpr auto xdlops_gemm = + XdlopsGemm{}; + + static constexpr index_t AMmaKStride = KPack; + static constexpr index_t BMmaKStride = KPack; + + //> store rows/cols into thread registers in chunks of 16 + //> e.g. 
[k0,...,k15,k64,...,k79] or [k0,...,k15,k32,...,k47] + static constexpr index_t KThreadChunk = 16; + + static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops; + static constexpr index_t KRepeat = KPerThread / KPack; + static constexpr index_t KPerInnerLoop = KPack; + + static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL); + static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL); + + using HotLoopInstList = + ck::BlockwiseGemmXdlops_pipeline_hotloop_inst; + + static_assert(KPerThread % KPack == 0, + "Wrong KPack setting; try increasing KPerThread or decreasing KPack"); + + StaticBufferTupleOfVector + c_thread_buf_; + + __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; } + + __device__ static auto GetWaveIdx() + { + const index_t thread_id = ThisThreadBlock::GetThreadId(); + + constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); + } + + __device__ static auto CalculateAThreadOriginDataIndex() + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + + const auto xdlops_a_idx = xdlops_gemm.CalculateAThreadOriginDataIndex(); + + return make_tuple(0, waveId_m, xdlops_a_idx[I1], KThreadChunk * xdlops_a_idx[I0]); + } + + __device__ static auto CalculateBThreadOriginDataIndex() + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_n = wave_idx[I1]; + + const auto xdlops_b_idx = xdlops_gemm.CalculateBThreadOriginDataIndex(); + + return make_tuple(0, waveId_n, xdlops_b_idx[I1], KThreadChunk * xdlops_b_idx[I0]); + } + + template + __device__ static auto + CalculateCThreadOriginDataIndex(Number, Number, Number, Number) + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + const auto 
waveId_n = wave_idx[I1]; + + const auto blk_idx = xdlops_gemm.GetBeginOfThreadBlk(xdlops_i, blk_i); + + constexpr auto mrepeat_mwave_mperxdl_to_m_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerXDL))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + constexpr auto nrepeat_nwave_nperxdl_to_n_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerXDL))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + const index_t c_thread_m = mrepeat_mwave_mperxdl_to_m_adaptor.CalculateBottomIndex( + make_tuple(m0, waveId_m, blk_idx[I0]))[I0]; + const index_t c_thread_n = nrepeat_nwave_nperxdl_to_n_adaptor.CalculateBottomIndex( + make_tuple(n0, waveId_n, blk_idx[I1]))[I0]; + + return make_tuple(c_thread_m, c_thread_n); + } + + using Tuple4 = decltype(CalculateAThreadOriginDataIndex()); + + /** + * @brief Constructor for BlockwiseGemmXdlops_mx_pipeline_base. + * + * This constructor initializes the thread copy objects for matrices A and B. + * It also performs several compile-time checks to ensure the correctness of the + * matrix tile descriptors. + * + * @param a_origin The origin data index for matrix A. + * @param b_origin The origin data index for matrix B. + * + * @note The constructor includes static assertions to ensure that: + * - The matrix tile descriptors for A and B are known at compile-time. + * - The number of threads in the thread block matches the product of MWaves, NWaves, and + * WaveSize. + * - The dimensions of the block are divisible by the product of the corresponding XDL and + * repeat dimensions. 
+ */ + __host__ __device__ + BlockwiseGemmXdlops_mx_pipeline_base(Tuple4 a_origin = CalculateAThreadOriginDataIndex(), + Tuple4 b_origin = CalculateBThreadOriginDataIndex()) + : a_thread_copy_(a_origin), b_thread_copy_(b_origin) + { + static_assert(AMmaTileDesc::IsKnownAtCompileTime() && BMmaTileDesc::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize, + "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n"); + + static_assert(MPerBlock % (MPerXDL * MRepeat) == 0 && NPerBlock % (NPerXDL * NRepeat) == 0, + "wrong!"); + } + + // transposed XDL output supporting C_xdl' = B_xdl' * A_xdl' + __host__ __device__ static constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4() + { + constexpr auto c_m0_m1_m2_n_tblk_lens = xdlops_gemm.GetCM0M1M2NThreadBlkLengths(); + + constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0]; + constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1]; + constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2]; + constexpr auto N = c_m0_m1_m2_n_tblk_lens[I3]; + + return make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, I1, I1, N, M0, M1, M2)); + } + + // XDL output supporting C_xdl = A_xdl * B_xdl + __host__ __device__ static constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_m0_m1_m2_n_tblk_lens = xdlops_gemm.GetCM0M1M2NThreadBlkLengths(); + + constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0]; + constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1]; + constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2]; + constexpr auto N = c_m0_m1_m2_n_tblk_lens[I3]; + + return make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, I1, I1, M0, M1, M2, N)); + } + + __host__ __device__ static constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_m0_m1_m2_n_tblk_lens = xdlops_gemm.GetCM0M1M2NThreadBlkLengths(); + + constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0]; + constexpr auto M1 
= c_m0_m1_m2_n_tblk_lens[I1]; + constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2]; + constexpr auto N = c_m0_m1_m2_n_tblk_lens[I3]; + + return make_naive_tensor_descriptor_packed( + make_tuple(I1, Number{}, Number{}, I1, I1, M0, M1, M2, N)); + } + + // transposed XDL output supporting C_xdl' = B_xdl' * A_xdl' + __host__ __device__ static constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4() + { + constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2 = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return xdlops_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(c_block_desc_m0_n0_m1_n1_m2_n2); + } + + // XDL output supporting C_xdl = A_xdl * B_xdl + __host__ __device__ static constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2 = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return xdlops_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_block_desc_m0_n0_m1_n1_m2_n2); + } + + __host__ __device__ static constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_block_desc_g_m0_n0_m1_n1_m2_n2 = + make_naive_tensor_descriptor_packed(make_tuple(I1, + Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return xdlops_gemm.MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2( + c_block_desc_g_m0_n0_m1_n1_m2_n2); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto c_grid_desc_m0_n0_m1_n1_m2_n2 = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(M / (MWaves * MPerXDL), MWaves, MPerXDL)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerXDL), NWaves, NPerXDL))), + 
make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{})); + + return xdlops_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m0_n0_m1_n1_m2_n2); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_G_M_N& c_grid_desc_g_m_n) + { + const auto G = c_grid_desc_g_m_n.GetLength(I0); + const auto M = c_grid_desc_g_m_n.GetLength(I1); + const auto N = c_grid_desc_g_m_n.GetLength(I2); + + const auto c_grid_desc_g_m0_n0_m1_n1_m2_n2 = transform_tensor_descriptor( + c_grid_desc_g_m_n, + make_tuple(make_pass_through_transform(G), + make_unmerge_transform(make_tuple(M / (MWaves * MPerXDL), MWaves, MPerXDL)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerXDL), NWaves, NPerXDL))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 3, 5>{}, Sequence<2, 4, 6>{})); + + return xdlops_gemm.MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2( + c_grid_desc_g_m0_n0_m1_n1_m2_n2); + } + + static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k; + static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_k; + + protected: + // M1, N1 as double buffer index + // Read buffer + Compute buffer + // A[M0, M1, M2, KPack] + static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor( + make_tuple(Number{}, I1, Number{}, Number{}), + make_tuple( + Number{}, Number{}, Number{}, I1)); + + // B[N0, N1, N2, KPack] + static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor( + make_tuple(Number{}, I1, Number{}, Number{}), + make_tuple( + Number{}, Number{}, Number{}, I1)); + + // C[M, N, NumRegXdlops] + static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, xdlops_gemm.GetRegSizePerXdlops())); + + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + A_K1, + A_K1>; + + using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 
3, + B_K1, + B_K1>; + + AThreadCopy a_thread_copy_; + BThreadCopy b_thread_copy_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_mx_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_mx_selector.hpp index 24f6afc381..c1433659d6 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_mx_selector.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_mx_selector.hpp @@ -7,6 +7,35 @@ namespace ck { +/** + * @brief Define matrix data types that have hardware support for MX GEMMs + */ +template +static constexpr bool is_scale_mfma_data_type() +{ + return is_same_v || is_same_v || is_same_v || + is_same_v || is_same_v; +} + +/** + * @brief Define scale data types that have hardware support for MX GEMMs + */ +template +static constexpr bool is_scale_mfma_scale_type() +{ + return is_same_v; +} + +/** + * @brief Combination of data types that have hardware support for MX GEMMs + */ +template +static constexpr bool scale_mfma_hw_support() +{ + return is_scale_mfma_data_type() && is_scale_mfma_data_type() && + is_scale_mfma_scale_type() && is_scale_mfma_scale_type(); +} + template constexpr auto BlockGemmMXPipeline_Selector() { + + // Hardware MX GEMM pipeline if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1) { return BlockwiseGemmXdlops_pipeline_v1_mx - : BlockwiseGemmXdlops_pipeline_base + : BlockwiseGemmXdlops_mx_pipeline_base { - using Base = BlockwiseGemmXdlops_pipeline_base; + + using Base = BlockwiseGemmXdlops_mx_pipeline_base; using Base::I0; using Base::I1; using Base::KRepeat; @@ -134,7 +125,6 @@ struct BlockwiseGemmXdlops_pipeline_v1_mx How many mx-vectors in each row/col is processed in one call to xdlops_gemm.Run() + static constexpr auto ScalesPerXdlopsRun = (KPack * xdlops_gemm.K0PerXdlops) / ScaleBlockSize; + + //> How many scales a thread must read to accommodate one call to xdlops_gemm.Run() + static 
constexpr auto ScalesPerXdlopsRunPerThread = + ScalesPerXdlopsRun / xdlops_gemm.mfma_instr.num_input_blks; __host__ static constexpr bool BlockHasHotloop(index_t num_loop) { @@ -172,45 +173,6 @@ struct BlockwiseGemmXdlops_pipeline_v1_mx( + auto a_thread_buf = make_static_buffer( a_thread_desc_.GetElementSpaceSize()); - auto b_thread_buf = make_static_buffer( + auto b_thread_buf = make_static_buffer( b_thread_desc_.GetElementSpaceSize()); auto a_scale_thread_buf = make_static_buffer( @@ -276,49 +238,31 @@ struct BlockwiseGemmXdlops_pipeline_v1_mx{}([&](auto m0) { static_for<0, KRepeat, 1>{}([&](auto k0) { - static_for<0, xdlops_gemm.mfma_instr.num_groups_per_blk, 1>{}([&](auto g) { - auto a_scale_thread_buf_group = + static_for<0, ScalesPerXdlopsRunPerThread, 1>{}([&](auto s) { + constexpr auto a_scale_offset = + a_scale_thread_desc.CalculateOffset(make_tuple(m0, k0, s)); + auto a_scale_thread_buf_copy = make_static_buffer( - a_scale_thread_desc_group.GetElementSpaceSize()); - + a_scale_thread_desc_copy.GetElementSpaceSize()); a_scale_thread_copy.Run(a_scale_grid_desc, a_scale_grid_buf, - a_scale_thread_desc_group, + a_scale_thread_desc_copy, make_tuple(I0, I0), - a_scale_thread_buf_group); + a_scale_thread_buf_copy); - static_for<0, xdlops_gemm.mfma_instr.group_size, 1>{}([&](auto i) { - constexpr index_t a_scale_offset = - a_scale_thread_desc.CalculateOffset(make_tuple(m0, k0, g, i)); - a_scale_thread_buf(Number{}) = - a_scale_thread_buf_group[Number{}]; - }); - // go to the next group + a_scale_thread_buf(Number{}) = + a_scale_thread_buf_copy[Number<0>{}]; a_scale_thread_copy.MoveSrcSliceWindow( a_scale_grid_desc, - make_multi_index(2 * xdlops_gemm.mfma_instr.group_size, 0)); - }); // g - - // restore row id and advance to the next scale - a_scale_thread_copy.MoveSrcSliceWindow( - a_scale_grid_desc, - make_multi_index(-2 * xdlops_gemm.mfma_instr.group_size * - xdlops_gemm.mfma_instr.num_groups_per_blk, - 1)); - }); // k0 - - // restore column id and advance to 
the next set of rows + make_multi_index(0, xdlops_gemm.KPerXdlops / ScaleBlockSize)); + }); + }); a_scale_thread_copy.MoveSrcSliceWindow( a_scale_grid_desc, make_multi_index(MWaves * MPerXDL, -ScalesPerKBlockSize)); - }); // m0 + }); // restore row id and advance to the next set of scales a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, @@ -326,15 +270,32 @@ struct BlockwiseGemmXdlops_pipeline_v1_mx{}([&](auto n0) { - b_scale_thread_copy.Run(b_scale_grid_desc, - b_scale_grid_buf, - b_scale_thread_desc, - make_tuple(n0, I0), - b_scale_thread_buf); - b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, - make_multi_index(NWaves * NPerXDL, 0)); + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, ScalesPerXdlopsRunPerThread, 1>{}([&](auto s) { + constexpr auto b_scale_offset = + b_scale_thread_desc.CalculateOffset(make_tuple(n0, k0, s)); + auto b_scale_thread_buf_copy = + make_static_buffer( + b_scale_thread_desc_copy.GetElementSpaceSize()); + b_scale_thread_copy.Run(b_scale_grid_desc, + b_scale_grid_buf, + b_scale_thread_desc_copy, + make_tuple(I0, I0), + b_scale_thread_buf_copy); + + b_scale_thread_buf(Number{}) = + b_scale_thread_buf_copy[Number<0>{}]; + b_scale_thread_copy.MoveSrcSliceWindow( + b_scale_grid_desc, + make_multi_index(0, xdlops_gemm.KPerXdlops / ScaleBlockSize)); + }); + }); + b_scale_thread_copy.MoveSrcSliceWindow( + b_scale_grid_desc, make_multi_index(NWaves * NPerXDL, -ScalesPerKBlockSize)); }); + // restore col id and advance to the next set of scales + // NWaves * NPerXDL * NRepeat == NPerBlock b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, make_multi_index(-NPerBlock, ScalesPerKBlockSize)); @@ -345,8 +306,6 @@ struct BlockwiseGemmXdlops_pipeline_v1_mx(); - // main body if constexpr(HasMainLoop) { @@ -363,141 +322,166 @@ struct BlockwiseGemmXdlops_pipeline_v1_mx 15 32 --> 47 | 64 --> 79 96 --> 111 | etc. + // t32: |16 --> 31 48 --> 63 | 80 --> 95 112 --> 127 | etc. 
+ // k = 0 k = 1 + + // k indexes mapping to threads for 16x16x128: + // t0 : |0 --> 15 64 --> 79 | 128 --> 143 192 --> 207| etc. + // t16: |16 --> 31 80 --> 95 | 144 --> 159 208 --> 223| etc. + // t32: |32 --> 47 96 --> 111| 160 --> 175 224 --> 239| etc. + // t48: |48 --> 63 112 --> 127| 176 --> 191 240 --> 255| etc. + // k = 0 k = 1 static_for<0, KRepeat, 1>{}([&](auto k) { - constexpr auto a_k_step = k * AMmaKStride * KPack / xdlops_gemm.K1PerXdlops; - constexpr auto b_k_step = k * BMmaKStride * KPack / xdlops_gemm.K1PerXdlops; + constexpr auto k_step = + k * xdlops_gemm.KPerXdlops * (KPack / xdlops_gemm.K1PerXdlops); static_for<0, MRepeat, 1>{}([&](auto m0) { - a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, - make_tuple(m0, I0, I0, Number{}), - a_block_buf, - a_thread_desc_, - make_tuple(m0, I0, k, I0), - a_thread_buf); + static_for<0, xdlops_gemm.K1PerXdlops / KThreadChunk, 1>{}([&](auto chunk) { + constexpr auto a_k_step_chunk = + k_step + + chunk * KThreadChunk * xdlops_gemm.mfma_instr.num_input_blks; + a_thread_copy_.Run( + a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, k, Number{}), + a_thread_buf); + }); }); static_for<0, NRepeat, 1>{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_buf); + // read block data in chunks to assemble correct thread vectors + static_for<0, xdlops_gemm.K1PerXdlops / KThreadChunk, 1>{}([&](auto chunk) { + constexpr auto b_k_step_chunk = + k_step + + chunk * KThreadChunk * xdlops_gemm.mfma_instr.num_input_blks; + b_thread_copy_.Run( + b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k, Number{}), + b_thread_buf); + }); }); }); static_for<0, MRepeat, 1>{}([&](auto m0) { static_for<0, NRepeat, 1>{}([&](auto n0) { static_for<0, KRepeat, 1>{}([&](auto k0) { - 
c_thread_buf_per_scale.Clear(); - vector_type a_thread_vec; - vector_type b_thread_vec; + vector_type a_thread_vec; + vector_type b_thread_vec; static_for<0, KPack, 1>{}([&](auto ik) { - a_thread_vec.template AsType()(ik) = + a_thread_vec.template AsType()(ik) = a_thread_buf[Number{}]; - b_thread_vec.template AsType()(ik) = + b_thread_vec.template AsType()(ik) = b_thread_buf[Number{}]; }); - using mfma_input_type = - typename vector_type::type; + constexpr index_t a_scale_offset = + a_scale_thread_desc.CalculateOffset(make_tuple(m0, k0, I0)); + constexpr index_t b_scale_offset = + b_scale_thread_desc.CalculateOffset(make_tuple(n0, k0, I0)); + + static_assert(0 < ScalesPerXdlopsRunPerThread, + "Must have at least one scale per Xdlops per Thread."); + + vector_type + a_scale_thread_vec; + vector_type + b_scale_thread_vec; + + // Pack scale_thread_buf into scale_thread_vec + static_for<0, ScalesPerXdlopsRunPerThread, 1>{}([&](auto s) { + a_scale_thread_vec.template AsType()(s) = + a_scale_thread_buf[Number{}]; + b_scale_thread_vec.template AsType()(s) = + b_scale_thread_buf[Number{}]; + }); + + using mfma_input_type_a = + typename vector_type::type; + using mfma_input_type_b = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); // MFMA accumulation - // m = 1:MPerXDL - // n = 1:NPerXDL - // k = 1:KPack - // c(m,n) += a(m,k)*b(k,n) xdlops_gemm.template Run<>( - a_thread_vec.template AsType(), - b_thread_vec.template AsType(), - c_thread_buf_per_scale.GetVectorTypeReference(I0)); - - // one scale per k0 - constexpr index_t b_scale_offset = - b_scale_thread_desc.CalculateOffset(make_tuple(n0, k0)); - - static_for<0, xdlops_gemm.mfma_instr.num_groups_per_blk, 1>{}( - [&](auto g) { - static_for<0, xdlops_gemm.mfma_instr.group_size, 1>{}( - [&](auto r) { - constexpr index_t a_scale_offset = - a_scale_thread_desc.CalculateOffset( - make_tuple(m0, k0, g, r)); - - constexpr auto reg_offset = - g * 
xdlops_gemm.mfma_instr.group_size + r; - - constexpr index_t c_offset = - c_thread_desc_.CalculateOffset( - make_tuple(m0, n0, reg_offset)); - - c_thread_buf(Number{}) += - c_thread_buf_per_scale[Number{}] * - type_convert( - b_scale_thread_buf[Number{}]) * - type_convert( - a_scale_thread_buf[Number{}]); - }); - }); + a_thread_vec.template AsType(), + a_scale_thread_vec.template AsType(), + b_thread_vec.template AsType(), + b_scale_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); }); }); }); + // Prefetch a_scales static_for<0, MRepeat, 1>{}([&](auto m0) { static_for<0, KRepeat, 1>{}([&](auto k0) { - static_for<0, xdlops_gemm.mfma_instr.num_groups_per_blk, 1>{}([&](auto g) { - auto a_scale_thread_buf_group = + static_for<0, ScalesPerXdlopsRunPerThread, 1>{}([&](auto s) { + constexpr auto a_scale_offset = + a_scale_thread_desc.CalculateOffset(make_tuple(m0, k0, s)); + auto a_scale_thread_buf_copy = make_static_buffer( - a_scale_thread_desc_group.GetElementSpaceSize()); - + a_scale_thread_desc_copy.GetElementSpaceSize()); a_scale_thread_copy.Run(a_scale_grid_desc, a_scale_grid_buf, - a_scale_thread_desc_group, + a_scale_thread_desc_copy, make_tuple(I0, I0), - a_scale_thread_buf_group); + a_scale_thread_buf_copy); - static_for<0, xdlops_gemm.mfma_instr.group_size, 1>{}([&](auto r) { - constexpr index_t a_scale_offset = - a_scale_thread_desc.CalculateOffset(make_tuple(m0, k0, g, r)); - a_scale_thread_buf(Number{}) = - a_scale_thread_buf_group[Number{}]; - }); - // go to the next group + a_scale_thread_buf(Number{}) = + a_scale_thread_buf_copy[Number<0>{}]; a_scale_thread_copy.MoveSrcSliceWindow( a_scale_grid_desc, - make_multi_index(2 * xdlops_gemm.mfma_instr.group_size, 0)); - }); // g - - // restore row id and advance to the next scale - a_scale_thread_copy.MoveSrcSliceWindow( - a_scale_grid_desc, - make_multi_index(-2 * xdlops_gemm.mfma_instr.group_size * - xdlops_gemm.mfma_instr.num_groups_per_blk, - 1)); - }); // k0 - - // restore 
column id and advance to the next set of rows + make_multi_index(0, xdlops_gemm.KPerXdlops / ScaleBlockSize)); + }); + }); a_scale_thread_copy.MoveSrcSliceWindow( a_scale_grid_desc, make_multi_index(MWaves * MPerXDL, -ScalesPerKBlockSize)); - }); // m0 + }); // restore row id and advance to the next set of scales a_scale_thread_copy.MoveSrcSliceWindow( a_scale_grid_desc, make_multi_index(-MPerBlock, ScalesPerKBlockSize)); + // Prefetch b_scales static_for<0, NRepeat, 1>{}([&](auto n0) { - b_scale_thread_copy.Run(b_scale_grid_desc, - b_scale_grid_buf, - b_scale_thread_desc, - make_tuple(n0, I0), - b_scale_thread_buf); - b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, - make_multi_index(NWaves * NPerXDL, 0)); + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, ScalesPerXdlopsRunPerThread, 1>{}([&](auto s) { + constexpr auto b_scale_offset = + b_scale_thread_desc.CalculateOffset(make_tuple(n0, k0, s)); + auto b_scale_thread_buf_copy = + make_static_buffer( + b_scale_thread_desc_copy.GetElementSpaceSize()); + b_scale_thread_copy.Run(b_scale_grid_desc, + b_scale_grid_buf, + b_scale_thread_desc_copy, + make_tuple(I0, I0), + b_scale_thread_buf_copy); + + b_scale_thread_buf(Number{}) = + b_scale_thread_buf_copy[Number<0>{}]; + b_scale_thread_copy.MoveSrcSliceWindow( + b_scale_grid_desc, + make_multi_index(0, xdlops_gemm.KPerXdlops / ScaleBlockSize)); + }); + }); + b_scale_thread_copy.MoveSrcSliceWindow( + b_scale_grid_desc, + make_multi_index(NWaves * NPerXDL, -ScalesPerKBlockSize)); }); + + // restore col id and advance to the next set of scales // NWaves * NPerXDL * NRepeat == NPerBlock b_scale_thread_copy.MoveSrcSliceWindow( b_scale_grid_desc, make_multi_index(-NPerBlock, ScalesPerKBlockSize)); @@ -507,7 +491,6 @@ struct BlockwiseGemmXdlops_pipeline_v1_mx{}([&](auto k) { - constexpr auto a_k_step = k * AMmaKStride * KPack / xdlops_gemm.K1PerXdlops; - constexpr auto b_k_step = k * BMmaKStride * KPack / xdlops_gemm.K1PerXdlops; + constexpr auto k_step = 
+ k * xdlops_gemm.KPerXdlops * (KPack / xdlops_gemm.K1PerXdlops); static_for<0, MRepeat, 1>{}([&](auto m0) { - a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, - make_tuple(m0, I0, I0, Number{}), - a_block_buf, - a_thread_desc_, - make_tuple(m0, I0, k, I0), - a_thread_buf); + // read block data in chunks to assemble correct thread + static_for<0, xdlops_gemm.K1PerXdlops / KThreadChunk, 1>{}([&](auto chunk) { + constexpr auto a_k_step_chunk = + k_step + chunk * KThreadChunk * xdlops_gemm.mfma_instr.num_input_blks; + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, k, Number{}), + a_thread_buf); + }); }); static_for<0, NRepeat, 1>{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k, I0), - b_thread_buf); + // read block data in chunks to assemble correct thread + static_for<0, xdlops_gemm.K1PerXdlops / KThreadChunk, 1>{}([&](auto chunk) { + constexpr auto b_k_step_chunk = + k_step + chunk * KThreadChunk * xdlops_gemm.mfma_instr.num_input_blks; + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, k, Number{}), + b_thread_buf); + }); }); }); static_for<0, MRepeat, 1>{}([&](auto m0) { static_for<0, NRepeat, 1>{}([&](auto n0) { static_for<0, KRepeat, 1>{}([&](auto k0) { - c_thread_buf_per_scale.Clear(); - vector_type a_thread_vec; - vector_type b_thread_vec; + vector_type a_thread_vec; + vector_type b_thread_vec; static_for<0, KPack, 1>{}([&](auto ik) { - a_thread_vec.template AsType()(ik) = + a_thread_vec.template AsType()(ik) = a_thread_buf[Number{}]; - b_thread_vec.template AsType()(ik) = + b_thread_vec.template AsType()(ik) = b_thread_buf[Number{}]; }); - using mfma_input_type = - typename vector_type::type; + constexpr index_t a_scale_offset = + a_scale_thread_desc.CalculateOffset(make_tuple(m0, k0, I0)); 
- xdlops_gemm.template Run<>( - a_thread_vec.template AsType(), - b_thread_vec.template AsType(), - c_thread_buf_per_scale.GetVectorTypeReference(I0)); - - // one scale per k0 constexpr index_t b_scale_offset = - b_scale_thread_desc.CalculateOffset(make_tuple(n0, k0)); + b_scale_thread_desc.CalculateOffset(make_tuple(n0, k0, I0)); - static_for<0, xdlops_gemm.mfma_instr.num_groups_per_blk, 1>{}([&](auto g) { - static_for<0, xdlops_gemm.mfma_instr.group_size, 1>{}([&](auto r) { - constexpr index_t a_scale_offset = - a_scale_thread_desc.CalculateOffset(make_tuple(m0, k0, g, r)); + vector_type a_scale_thread_vec; + vector_type b_scale_thread_vec; - constexpr auto reg_offset = - g * xdlops_gemm.mfma_instr.group_size + r; - - constexpr index_t c_offset = - c_thread_desc_.CalculateOffset(make_tuple(m0, n0, reg_offset)); - - c_thread_buf(Number{}) += - c_thread_buf_per_scale[Number{}] * - type_convert( - b_scale_thread_buf[Number{}]) * - type_convert( - a_scale_thread_buf[Number{}]); - }); + // Pack b_scale_thread_buf into b_scale_thread_vec + static_for<0, ScalesPerXdlopsRunPerThread, 1>{}([&](auto s) { + a_scale_thread_vec.template AsType()(s) = + a_scale_thread_buf[Number{}]; + b_scale_thread_vec.template AsType()(s) = + b_scale_thread_buf[Number{}]; }); + + using mfma_input_type_a = + typename vector_type::type; + using mfma_input_type_b = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + // MFMA accumulation + xdlops_gemm.template Run<>( + a_thread_vec.template AsType(), + a_scale_thread_vec.template AsType(), + b_thread_vec.template AsType(), + b_scale_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); }); }); }); } } - // TODO: make this field protected when a_scale_thread_copy_ is moved here + // TODO: make this field protected when a_scale_thread_copy_ is moved + // here static constexpr auto a_scale_thread_desc = make_naive_tensor_descriptor_packed( - 
make_tuple(Number{}, - Number{}, - Number{}, - Number{})); + make_tuple(Number{}, Number{}, Number{})); // Is used to copy data from a_scale_grid to a_scale_thread - static constexpr auto a_scale_thread_desc_group = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number<1>{})); + static constexpr auto a_scale_thread_desc_copy = + make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<1>{})); - // TODO: make this field protected when b_scale_thread_copy_ is moved here - static constexpr auto b_scale_thread_desc = - make_naive_tensor_descriptor_packed(make_tuple(Number{}, Number{})); + // TODO: make this field protected when b_scale_thread_copy_ is moved + // here + static constexpr auto b_scale_thread_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, Number{})); + + // Is used to copy data from b_scale_grid to b_scale_thread_buf + static constexpr auto b_scale_thread_desc_copy = + make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<1>{})); protected: using Base::a_thread_copy_; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp index 34df9a1d7b..8a370304c6 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp @@ -694,14 +694,7 @@ struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX || is_same_v || - is_same_v || is_same_v || - is_same_v)&&(is_same_v || - is_same_v || - is_same_v || - is_same_v || - is_same_v), + static_assert(is_scale_mfma_data_type() && is_scale_mfma_data_type(), "Only microscaling formats are supported for ADataType and BDataType"); static_assert(ScaleBlockSize == 32, "Only ScaleBlockSize 32 is supported"); @@ -711,6 +704,11 @@ struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX{}; static constexpr auto BK1Number = Number{}; - static 
constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number); - static constexpr bool is_single_rate_mfma = - ((is_same::value || is_same::value) && - lcm_AK1_BK1 <= 4) - ? true - : false; + static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number); + static constexpr bool is_single_rate_mfma = false; + static constexpr auto is_scale_mfma = true; + + //> KPack is at least the k_per_blk of selected mfma + // + // Should be a multiple of k_per_blk. + // TODO: Move this to blockwise pipeline base static constexpr index_t KPack = math::max(lcm_AK1_BK1, - MfmaSelector:: - selected_mfma.k_per_blk); + MfmaSelector::selected_mfma.k_per_blk); using ThisThreadBlock = ThisThreadBlock; @@ -1088,10 +1094,6 @@ struct GridwiseGemmMX_xdl_cshuffle_v3 static_assert(KPerBlock % ScaleBlockSize == 0, "KPerBlock should be multiple of ScaleBlockSize"); - static_assert(KPerBlock / ScaleBlockSize == BlockwiseGemmPipe::KRepeat, - "Single call to xdlops_gemm::Run should process exactly ScaleBlockSize " - "elements in k dimension"); - if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding || GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding || GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding || @@ -1476,61 +1478,63 @@ struct GridwiseGemmMX_xdl_cshuffle_v3 (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / KPerBlock); - static constexpr auto mfma = BlockwiseGemmPipe::xdlops_gemm.mfma; - static constexpr auto KPerXdlops = mfma.GetKPerXdlops(); - static constexpr auto K1PerXdlops = mfma.GetK1PerXdlops(); - static constexpr auto K0PerXdlops = KPerXdlops / K1PerXdlops; - static constexpr auto KPerThread = KPerBlock / K0PerXdlops; - - // NXdlPerWave == NRepeat - // MXdlPerWave == MRepeat - constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl); - constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl); - - // Initial thread mapping for MPerXdl=NPerXdl=32 and MPerBlock=NPerBlock=128 
MWaves=NWaves=2 + // Initial thread mapping for: + // BlockSize = 256 + // MPerXdl=NPerXdl=32 and MPerBlock=NPerBlock=128 MRepeat=NRepeat=2 MWaves=NWaves=2 + // For each [m0, n0] tile, there are 4 waves: // tId in [ 0, 63] m x n = [ 0, 31] x [ 0, 31] waveId = [0, 0] // tId in [ 64, 127] m x n = [ 0, 31] x [32, 63] waveId = [0, 1] // tId in [128, 191] m x n = [32, 63] x [ 0, 31] waveId = [1, 0] // tId in [192, 255] m x n = [32, 63] x [32, 63] waveId = [1, 1] - auto a_thread_offset_m = - MPerXdl * ((get_thread_local_1d_id() / BlockwiseGemmPipe::WaveSize) / MWaves) + - mfma.selected_mfma.group_size * - ((get_thread_local_1d_id() % BlockwiseGemmPipe::WaveSize) / MPerXdl); - auto a_thread_offset_k = KPerThread * (get_thread_local_1d_id() % MPerXdl) / MPerXdl; + // BlockSize = 128 + // MPerXdl=NPerXdl=16 and MPerBlock=128 NPerBlock=16 MRepeat=4 NRepeat=1 MWaves=2 NWaves=1 + // For each [m0, n0] tile, there are 2 waves: + // tId in [ 0, 63] m x n = [ 0, 15] x [0, 15] waveId = [0, 0] + // tId in [ 64, 127] m x n = [16, 31] x [0, 15] waveId = [1, 0] - auto b_thread_offset_n = - get_thread_local_1d_id() % NPerXdl + - (get_thread_local_1d_id() / BlockwiseGemmPipe::WaveSize) % NWaves * NPerXdl; - auto b_thread_offset_k = KPerThread * (get_thread_local_1d_id() % NPerXdl) / NPerXdl; + // TODO: Document initial thread mapping for more combinations of parameters - auto a_scale_thread_copy = ThreadwiseTensorSliceTransfer_v2< - AScaleDataType, - AScaleDataType, - decltype(a_scale_grid_desc_am_ak), // SrcDesc - decltype(BlockwiseGemmPipe::a_scale_thread_desc_group), // DstDesc - Sequence, // SliceLengths - Sequence<0, 1>, // DimAccessOrder - 0, // SrcVectorDim - 1, // SrcScalarPerVector - 1, // SrcScalarStrideInVector - true>(a_scale_grid_desc_am_ak, - make_multi_index(block_m_id * MPerBlock + a_thread_offset_m, - a_thread_offset_k / ScaleBlockSize)); + const auto wave_idx = BlockwiseGemmPipe::GetWaveIdx(); + const auto waveId_m = wave_idx[I0]; + const auto waveId_n = wave_idx[I1]; - 
auto b_scale_thread_copy = ThreadwiseTensorSliceTransfer_v2< - BScaleDataType, - BScaleDataType, - decltype(b_scale_grid_desc_bn_ak), - decltype(BlockwiseGemmPipe::b_scale_thread_desc), - Sequence<1, BlockwiseGemmPipe::KRepeat>, // SliceLengths - Sequence<0, 1>, // DimAccessOrder - 1, // SrcVectorDim - BlockwiseGemmPipe::KRepeat, // SrcScalarPerVector - 1, - false>(b_scale_grid_desc_bn_ak, - make_multi_index(block_n_id * NPerBlock + b_thread_offset_n, - b_thread_offset_k / ScaleBlockSize)); + static constexpr auto mfma = BlockwiseGemmPipe::xdlops_gemm.mfma; + + auto thread_offset_k = (get_thread_local_1d_id() % BlockwiseGemmPipe::WaveSize) / + mfma.selected_mfma.num_threads_per_blk; + + auto a_thread_offset_m = get_thread_local_1d_id() % MPerXdl + waveId_m * MPerXdl; + + auto a_scale_thread_copy = + ThreadwiseTensorSliceTransfer_v2, // SliceLengths + Sequence<0, 1>, // DimAccessOrder + 1, // SrcVectorDim + 1, // SrcScalarPerVector + 1, // SrcScalarStrideInVector + true>( + a_scale_grid_desc_am_ak, + make_multi_index(block_m_id * MPerBlock + a_thread_offset_m, thread_offset_k)); + + auto b_thread_offset_n = get_thread_local_1d_id() % NPerXdl + waveId_n * NPerXdl; + + auto b_scale_thread_copy = + ThreadwiseTensorSliceTransfer_v2, // SliceLengths + Sequence<0, 1>, // DimAccessOrder + 1, // SrcVectorDim + 1, // SrcScalarPerVector + 1, + true>( + b_scale_grid_desc_bn_ak, + make_multi_index(block_n_id * NPerBlock + b_thread_offset_n, thread_offset_k)); blockwise_gemm_pipeline.template Run(a_grid_desc_ak0_m_ak1, a_block_desc_ak0_m_ak1, diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 0310fe37a0..2255505985 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -211,8 +211,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3 * @tparam 
SrcVectorDim The dimension along which vectorized access is performed in the source * tensor. * @tparam SrcScalarPerVector The number of scalar elements per vector in the source tensor. - * @tparam SrcScalarStrideInVector The stride of scalar elements within a vector in the source - * tensor. + * @tparam SrcScalarStrideInVector Not used. * @tparam SrcResetCoordinateAfterRun controls whether source coordinate is restored after each Run * or rolled back one step in MoveSrcSliceWindow * @tparam InvalidElementAsNaN Whether to fill invalid elements with NaN (only applicable for diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp index a638ca8608..529a1a1729 100644 --- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp @@ -845,15 +845,24 @@ struct mfma_type static constexpr bool is_k_reduction = true; // ??? // clang-format on - template + template __device__ void run(const FloatA& a, - const int32_t scale_a, + const ScaleA& scale_a, const FloatB& b, - const int32_t scale_b, + const ScaleB& scale_b, FloatC& reg_c) const { + static_assert(scalar_type::vector_size == 1, "Expect single scale at this point."); + static_assert(scalar_type::vector_size == 1, "Expect single scale at this point."); + intrin_mfma_scale_f32_32x32x64f8f6f4::Run( - a, scale_a, b, scale_b, reg_c); + a, utils::get_exponent_value(scale_a), b, utils::get_exponent_value(scale_b), reg_c); } }; @@ -874,15 +883,24 @@ struct mfma_type static constexpr bool is_k_reduction = true; // ??? 
// clang-format on - template + template __device__ void run(const FloatA& a, - const int32_t scale_a, + const ScaleA& scale_a, const FloatB& b, - const int32_t scale_b, + const ScaleB& scale_b, FloatC& reg_c) const { + static_assert(scalar_type::vector_size == 1, "Expect single scale at this point."); + static_assert(scalar_type::vector_size == 1, "Expect single scale at this point."); + intrin_mfma_scale_f32_16x16x128f8f6f4::Run( - a, scale_a, b, scale_b, reg_c); + a, utils::get_exponent_value(scale_a), b, utils::get_exponent_value(scale_b), reg_c); } }; @@ -890,14 +908,16 @@ template + bool is_single_rate_mfma = false, + bool is_scale_mfma = false> struct MfmaSelector { template + bool is_single_rate_mfma_ = false, + bool is_scale_mfma_ = false> static constexpr auto GetMfma(); template <> @@ -1103,12 +1123,24 @@ struct MfmaSelector return MfmaInstr::mfma_f32_32x32x16f8f8; } + template <> + constexpr auto GetMfma() + { + return MfmaInstr::mfma_scale_f32_32x32x64f8f6f4; + } + template <> constexpr auto GetMfma() { return MfmaInstr::mfma_f32_16x16x32f8f8; } + template <> + constexpr auto GetMfma() + { + return MfmaInstr::mfma_scale_f32_16x16x128f8f6f4; + } + template <> constexpr auto GetMfma() { @@ -1145,8 +1177,12 @@ struct MfmaSelector return MfmaInstr::mfma_f32_16x16x32bf8f8; } - static constexpr auto selected_mfma = mfma_type< - GetMfma()>{}; + static constexpr auto selected_mfma = mfma_type()>{}; __host__ __device__ constexpr MfmaSelector() { @@ -1194,7 +1230,8 @@ template + bool TransposeC = false, + bool is_scale_mfma = false> struct XdlopsGemm { static constexpr auto I0 = Number<0>{}; @@ -1225,7 +1262,7 @@ struct XdlopsGemm MPerXdlops == 64, "Only support GemmMPerXdlops == 4, 8, 16, 32 or 64 for xdlops"); - static_assert(KPack % mfma_instr.k_per_blk == 0, "KPack cannot be divided by k_per_blk"); + static_assert(KPack % mfma_instr.k_per_blk == 0, "KPack should be a multiple of k_per_blk"); } // XDL output supporting C = A * B @@ -1368,6 +1405,27 @@ struct 
XdlopsGemm }); } + template + __device__ void Run(const FloatA& p_a_wave, + const ScaleA& a_scale_thread, + const FloatB& p_b_wave, + const ScaleB& b_scale_thread, + FloatC& p_c_thread) const + { + static_for<0, KPack / mfma_instr.k_per_blk, 1>{}([&](auto k) { + if constexpr(!TransposeC) + { + mfma_instr.template run( + p_a_wave[k], a_scale_thread[k], p_b_wave[k], b_scale_thread[k], p_c_thread); + } + else + { + mfma_instr.template run( + p_b_wave[k], b_scale_thread[k], p_a_wave[k], a_scale_thread[k], p_c_thread); + } + }); + } + __device__ static auto GetLaneId() { return get_thread_local_1d_id() % mfma_instr.wave_size; } __device__ static auto GetBlkIdx() @@ -1455,7 +1513,8 @@ struct XdlopsGemm KPack <= 4) || (is_same::value && KPack <= 8)) ? true - : false > {}; + : false, + is_scale_mfma > {}; static constexpr auto mfma_instr = mfma.selected_mfma; diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index 0d4611becc..a54a181bf1 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -520,9 +520,9 @@ struct intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32> { template __device__ static void Run(const f8x32_t& reg_a, - const int32_t scale_a, + const int32_t& scale_a, const f8x32_t& reg_b, - const int32_t scale_b, + const int32_t& scale_b, FloatC& reg_c) { #if defined(__gfx950__) @@ -538,6 +538,14 @@ struct intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32> scale_a, 0, // OPSEL scale_b); + // XXX: Note on the scale_a and scale_b parameters: + // If compiler detects that one or both scales are constant values, it will treat that + // constant as F32 constant. I.e., if scale_a at some point was declared as + // `e8m0_bexp_t a_scale{1.0f}`, the instruction would only work if scale_a parameter is + // assigned value `bit_cast(static_cast(a_scale))`. + + // XXX: Note on the OPSEL parameters: Instruction always takes byte0 as a scale value even + // when OPSEL is set otherwise. 
#else ignore = reg_a; ignore = scale_a; @@ -556,9 +564,9 @@ struct intrin_mfma_scale_f32_16x16x128f8f6f4<16, 16> { template __device__ static void Run(const f8x32_t& reg_a, - const int32_t scale_a, + const int32_t& scale_a, const f8x32_t& reg_b, - const int32_t scale_b, + const int32_t& scale_b, FloatC& reg_c) { #if defined(__gfx950__) diff --git a/include/ck/utility/e8m0.hpp b/include/ck/utility/e8m0.hpp index a692f533f8..f7d2a2f594 100644 --- a/include/ck/utility/e8m0.hpp +++ b/include/ck/utility/e8m0.hpp @@ -67,10 +67,10 @@ struct e8m0_bexp_t namespace utils { template -__host__ __device__ inline int get_exponent_value(T x); +__host__ __device__ inline constexpr int32_t get_exponent_value(T x); template <> -__host__ __device__ inline int get_exponent_value(e8m0_bexp_t x) +__host__ __device__ inline constexpr int32_t get_exponent_value(e8m0_bexp_t x) { return x.data; } diff --git a/include/ck/utility/mxfp_utils.hpp b/include/ck/utility/mxfp_utils.hpp index f0a86f8750..cf7a3e8713 100644 --- a/include/ck/utility/mxfp_utils.hpp +++ b/include/ck/utility/mxfp_utils.hpp @@ -32,13 +32,13 @@ template __host__ __device__ inline bool is_inf(e8m0_bexp_t const scale, T const data); template -__host__ __device__ inline int get_exponent_value(T x) +__host__ __device__ inline constexpr int32_t get_exponent_value(T x) { x >>= NumericUtils::mant; x &= ((1 << NumericUtils::exp) - 1); - return static_cast(x); + return static_cast(x); } template diff --git a/test/mx_mfma_op/mx_mfma_op.hpp b/test/mx_mfma_op/mx_mfma_op.hpp index 1f9091ebc5..d22157c3b3 100644 --- a/test/mx_mfma_op/mx_mfma_op.hpp +++ b/test/mx_mfma_op/mx_mfma_op.hpp @@ -30,48 +30,69 @@ enum class MFMA_F8F6F4 }; -template +template struct mfma_type_selector; -template -struct mfma_type_selector +template <> +struct mfma_type_selector<16, 16> { - __device__ void operator()(AFragT const& fragA, BFragT const& fragB, AccumFragT& fragAcc) + template + __device__ static void run(AFragT const& fragA, BFragT const& fragB, 
AccumFragT& fragAcc) { auto op = mfma_type{}; - op.template run<16, 16, AFragT, BFragT, AccumFragT>(fragA, fragB, fragAcc); - } - - __device__ void operator()(AFragT const& fragA, - const int32_t scale_a, - BFragT const& fragB, - const int32_t scale_b, - AccumFragT& fragAcc) - { - auto op = mfma_type{}; - op.template run<16, 16, AFragT, BFragT, AccumFragT>( - fragA, scale_a, fragB, scale_b, fragAcc); + op.template run<16, 16>(fragA, fragB, fragAcc); } }; -template -struct mfma_type_selector +template <> +struct mfma_type_selector<32, 32> { - __device__ void operator()(AFragT const& fragA, BFragT const& fragB, AccumFragT& fragAcc) + template + __device__ static void run(AFragT const& fragA, BFragT const& fragB, AccumFragT& fragAcc) { auto op = mfma_type{}; - op.template run<32, 32, AFragT, BFragT, AccumFragT>(fragA, fragB, fragAcc); + op.template run<32, 32>(fragA, fragB, fragAcc); } +}; - __device__ void operator()(AFragT const& fragA, - const int32_t scale_a, +template +struct mfma_scale_type_selector; + +template <> +struct mfma_scale_type_selector<16, 16> +{ + template + __device__ static void run(AFragT const& fragA, + AScaleFragT const& scale_a, BFragT const& fragB, - const int32_t scale_b, + BScaleFragT const& scale_b, + AccumFragT& fragAcc) + { + auto op = mfma_type{}; + op.template run<16, 16>(fragA, scale_a[Number<0>{}], fragB, scale_b[Number<0>{}], fragAcc); + } +}; + +template <> +struct mfma_scale_type_selector<32, 32> +{ + template + __device__ static void run(AFragT const& fragA, + AScaleFragT const& scale_a, + BFragT const& fragB, + BScaleFragT const& scale_b, AccumFragT& fragAcc) { auto op = mfma_type{}; - op.template run<32, 32, AFragT, BFragT, AccumFragT>( - fragA, scale_a, fragB, scale_b, fragAcc); + op.template run<32, 32>(fragA, scale_a[Number<0>{}], fragB, scale_b[Number<0>{}], fragAcc); } }; @@ -334,8 +355,7 @@ __device__ AFragT load_mx_A_row_major(AType const* input_ptr, // BLOCK_K / BLOCK_X is a stride in xA matrix auto startOffset = 
row_major(startCoord2D, BLOCK_K / BLOCK_X); - // obtain 8-bit exponent - fragX = utils::get_exponent_value(scale_ptr[startOffset]) & 0xFF; + fragX = scale_ptr[startOffset]; return load_A_row_major(input_ptr); } @@ -502,7 +522,7 @@ __device__ BFragT load_mx_B_col_major(BType const* input_ptr, auto startOffset = col_major(startCoord2D, BLOCK_K / BLOCK_X); // obtain 8-bit exponent - fragX = utils::get_exponent_value(scale_ptr[startOffset]) & 0xFF; + fragX = scale_ptr[startOffset]; return load_B_col_major(input_ptr); } @@ -773,7 +793,8 @@ __global__ void matmul(const AType* a, const BType* b, CType* c) // Matrix multiply-accumulate using MFMA units // Accumulation intermediate = BLOCK_M x BLOCK_N - mfma_type_selector{}(fragA, fragB, fragAcc); + using mfma = mfma_type_selector; + mfma::template run<>(fragA, fragB, fragAcc); for(int i = 0; i < vectorSize(fragC); ++i) { @@ -805,29 +826,34 @@ matmul(const AType* a, const ScaleType* xa, const BType* b, const ScaleType* xb, using CFragT = vector_type::type; using AccumFragT = vector_type; using RawAccumFragT = vector_type::type; - using ScaleFragT = int32_t; + using AScaleFragT = vector_type::type; + using BScaleFragT = vector_type::type; // Create frags auto fragA = AFragT{}; auto fragB = BFragT{}; auto fragC = CFragT{}; auto fragAcc = AccumFragT{0}; - auto fragXa = ScaleFragT{0}; - auto fragXb = ScaleFragT{0}; + auto fragXa = AScaleFragT{}; + auto fragXb = BScaleFragT{}; // Load the inputs. 
// A = col major, BLOCK_M x BLOCK_K - fragA = load_mx_A_row_major( + fragA = load_mx_A_row_major( a, xa, fragXa); // B = col major, BLOCK_K x BLOCK_N - fragB = load_mx_B_col_major( + fragB = load_mx_B_col_major( b, xb, fragXb); // Scaled Matrix multiply-accumulate using MFMA units // Accumulation intermediate = BLOCK_M x BLOCK_N - mfma_type_selector{}( - fragA, fragXa, fragB, fragXb, fragAcc); + using mfma = mfma_scale_type_selector; + mfma::template run<>(fragA, + fragXa.template AsType(), + fragB, + fragXb.template AsType(), + fragAcc); for(int i = 0; i < vectorSize(fragC); ++i) { From 94d47b1680eaafacca142f2498fb94d08a5b66d3 Mon Sep 17 00:00:00 2001 From: joyeamd Date: Wed, 16 Apr 2025 09:21:04 +0800 Subject: [PATCH 045/443] fmha hdim256 vectorize improve (#2086) For hdim 256, will not have vectorized buffer load when seqlen % 256 != 0 and hdim % 256 = 0; this commit tries to solve this condition. --- example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index 10a6e5c1d7..3634810b37 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -445,6 +445,9 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm # if True: pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', bias, lse, dropout, squant, mask)) pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, lse, dropout, squant, mask)) + # the below two is used for hdim vectorize load + pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 'f', 'f', bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 'f', 'f', bias, lse, dropout, squant, mask)) pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', bias, lse, dropout, squant, mask)) pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', 
bias, lse, dropout, squant, mask)) From c5975529bb016318ae135431d61761b885f0f5b9 Mon Sep 17 00:00:00 2001 From: felix Date: Wed, 16 Apr 2025 10:53:21 +0800 Subject: [PATCH 046/443] add preshuffle gemm fp16 (#2036) * add preshuffle gemm fp16 * clang format and test ok * Update gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp remove useless comments in example * Update gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp remove 2 --------- Co-authored-by: coderfeli --- .../65_gemm_multiply_multiply/CMakeLists.txt | 1 + ...multiply_multiply_xdl_fp16_bpreshuffle.cpp | 371 ++++++++++++++++++ 2 files changed, 372 insertions(+) create mode 100644 example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt index 95fd8bace8..deca85ae64 100644 --- a/example/65_gemm_multiply_multiply/CMakeLists.txt +++ b/example/65_gemm_multiply_multiply/CMakeLists.txt @@ -1,6 +1,7 @@ add_example_executable(example_gemm_multiply_multiply_xdl_fp8 gemm_multiply_multiply_xdl_fp8.cpp) add_example_executable(example_gemm_multiply_multiply_xdl_fp8_ab_scale gemm_multiply_multiply_xdl_fp8_ab_scale.cpp) add_example_executable(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp) +add_example_executable(example_gemm_multiply_multiply_xdl_fp16_bpreshuffle gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp) add_example_executable(example_gemm_add_add_xdl_fp16 gemm_add_add_xdl_fp16.cpp) add_example_executable(example_gemm_multiply_multiply_xdl_int8 gemm_multiply_multiply_xdl_int8.cpp) add_example_executable(example_moe_gemm1_xdl_fp8 moe_gemm1_xdl_fp8.cpp) diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp new file mode 100644 index 0000000000..69803c7eeb --- /dev/null +++ 
b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp @@ -0,0 +1,371 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_b_preshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" + +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/check_err.hpp" + +#include "ck/utility/blkgemmpipe_scheduler.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using A0DataType = F16; +using B0DataType = F16; +using AccDataType = F32; +using CShuffleDataType = F32; +using D0DataType = F32; +using D1DataType = F32; +using DsDataType = ck::Tuple; +using EDataType = F16; + +using A0Layout = Row; +using B0Layout = Col; +using D0Layout = Row; +using D1Layout = Col; +using DsLayout = ck::Tuple; +using ELayout = Row; + +struct MultiplyMultiply +{ + template + __host__ __device__ constexpr void + operator()(E& e, const C& c, const D0& d0, const D1& d1) const; + + template <> + __host__ __device__ constexpr void operator()(F16& e, + const float& c, + const float& d0, + const float& d1) const + { + const float x0_f = c * d0 * d1; + + e = ck::type_convert(x0_f); + } + + template <> + 
__host__ __device__ constexpr void operator()(BF16& e, + const float& c, + const float& d0, + const float& d1) const + { + const float x0_f = c * d0 * d1; + + e = ck::type_convert(x0_f); + } + + template <> + __host__ __device__ constexpr void operator()( + ck::half_t& e, const int& c, const float& d0, const float& d1) const + { + const float x0_f = + ck::type_convert(c) * ck::type_convert(d0) * ck::type_convert(d1); + + e = ck::type_convert(x0_f); + } + + template <> + __host__ __device__ constexpr void operator()( + ck::bhalf_t& e, const int& c, const float& d0, const float& d1) const + { + const float x0_f = + ck::type_convert(c) * ck::type_convert(d0) * ck::type_convert(d1); + + e = ck::type_convert(x0_f); + } +}; + +void preShuffleBuffer(const F16* src, F16* dst, int N, int K, int NXdl) +{ + int KPack = 16 / sizeof(F16); + int NLane = NXdl; + int KLane = 64 / NLane; + + int K0 = K / (KLane * KPack); + // K -> K0 KLane KPack + // N -> N0 NLane + // N, K -> N0 K0 KLane NLane KPack + int tempk; + for(int n = 0; n < N; ++n) + { + for(int k = 0; k < K; ++k) + { + int n0 = n / NLane; + int n1 = n % NLane; + + int k0 = k / (KLane * KPack); + tempk = k % (KLane * KPack); + int k1 = tempk / KPack; + int k2 = tempk % KPack; + + int outputIndex = n0 * KPack * NLane * KLane * K0 + k0 * KPack * NLane * KLane + + k1 * KPack * NLane + n1 * KPack + k2; + + dst[outputIndex] = src[n * K + k]; + } + } +} +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = MultiplyMultiply; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; + +// using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3 +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle + // clang-format off +///######| ALayout| BLayout| DsLayout| ELayout| AData| BData| DsData| EData| AccData| 
CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +///######| | | | | Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +///######| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +///######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | S| +///###### RCR + // kernel 1: 256->32x128x128 + < Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, + AElementOp, BElementOp, CDEElementOp, GemmSpec, 256, + 32, 128, 128, + 8, 8, + 32, 32, + 1, 1, + S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, + S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, + 1, 1, S<1, 16, 1, 16>, S<8, 8, 1>, + ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, F16>; +// clang-format on + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = K; + ck::index_t StrideB = K; + ck::index_t StrideD = 0; + ck::index_t StrideE = N; + + ck::index_t 
KBatch = 1; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 12) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideD = std::stoi(argv[9]); + StrideE = std::stoi(argv[10]); + + KBatch = std::stoi(argv[11]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf( + "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, KBatch\n"); + exit(0); + } + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{})); + Tensor b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{})); + Tensor b0_preshuffled( + f_host_tensor_descriptor(K, N, StrideB, B0Layout{})); // use laout only for size + Tensor d0_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{})); + Tensor d1_m_n(f_host_tensor_descriptor(M, N, StrideD, D1Layout{})); + Tensor e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + Tensor e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + + std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl; + std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl; + std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl; + std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl; + std::cout << 
"e_m_n: " << e_m_n_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a0_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_k_n.GenerateTensorValue(GeneratorTensor_2{0, 2}); + d0_m_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + d1_m_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 2: + a0_m_k.GenerateTensorValue(GeneratorTensor_1{}); + b0_k_n.GenerateTensorValue(GeneratorTensor_1{}); + d0_m_n.GenerateTensorValue(GeneratorTensor_1{}); + d1_m_n.GenerateTensorValue(GeneratorTensor_1{}); + break; + default: + a0_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d0_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + d1_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + } + DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize()); + DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize()); + DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize()); + + a0_device_buf.ToDevice(a0_m_k.mData.data()); + d0_device_buf.ToDevice(d0_m_n.mData.data()); + d1_device_buf.ToDevice(d1_m_n.mData.data()); + e_device_buf.ToDevice(e_m_n_device_result.mData.data()); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + + constexpr ck::index_t NumDTensor = DsDataType::Size(); + + constexpr auto I0 = ck::Number<0>{}; + + // do GEMM + auto device_op = DeviceOpInstance{}; + + int NPerXdl = device_op.GetPreShuffleParameters(); + + preShuffleBuffer(b0_k_n.mData.data(), b0_preshuffled.mData.data(), N, K, NPerXdl); + + b0_device_buf.ToDevice(b0_preshuffled.mData.data()); + + auto invoker = device_op.MakeInvoker(); + auto argument = + 
device_op.MakeArgument(a0_device_buf.GetDeviceBuffer(), + b0_device_buf.GetDeviceBuffer(), + std::array{d0_device_buf.GetDeviceBuffer(), + d1_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{I0, I0}, + StrideE, + KBatch, + a_element_op, + b_element_op, + cde_element_op); + + if(!device_op.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 0, 50, 50, false, 1}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = + sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << std::endl; + + if(do_verification) + { + invoker.Run(argument, StreamConfig{nullptr, false}); + + e_device_buf.FromDevice(e_m_n_device_result.mData.data()); + + Tensor c_m_n({M, N}); + + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a0_m_k, b0_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n)); + } + } + + e_device_buf.FromDevice(e_m_n_device_result.mData.data()); + + return ck::utils::check_err( + e_m_n_device_result, e_m_n_host_result, "Error: Incorrect results!", 1e-3, 5e-2) + ? 
0 + : 1; + } + + return 0; +} From eaf1f0bf3b8fc015971be2e300e82abdd97ccfed Mon Sep 17 00:00:00 2001 From: "BingYuan.Zhou" Date: Wed, 16 Apr 2025 16:51:17 +0800 Subject: [PATCH 047/443] [flatmm] implement basic fp16 flatmm (#2089) * [flatmm] implement basic fp16 flatmm * fix CI build fail --------- Co-authored-by: root Co-authored-by: solin --- example/ck_tile/18_flatmm/CMakeLists.txt | 7 + example/ck_tile/18_flatmm/README.md | 35 ++ example/ck_tile/18_flatmm/flatmm_basic.cpp | 102 ++++ example/ck_tile/18_flatmm/flatmm_basic.hpp | 100 ++++ .../ck_tile/18_flatmm/run_flatmm_example.inc | 281 ++++++++++ .../18_flatmm/script/smoke_test_basic.sh | 34 ++ example/ck_tile/CMakeLists.txt | 1 + include/ck_tile/ops/flatmm.hpp | 6 + .../block_flatmm_asmem_bsmem_creg_v1.hpp | 187 +++++++ ...atmm_asmem_bsmem_creg_v1_custom_policy.hpp | 38 ++ .../ops/flatmm/kernel/flatmm_kernel.hpp | 496 ++++++++++++++++++ .../flatmm_pipeline_agmem_bgmem_creg_v1.hpp | 208 ++++++++ ...mm_pipeline_agmem_bgmem_creg_v1_policy.hpp | 265 ++++++++++ .../ops/flatmm/pipeline/tile_flatmm_shape.hpp | 43 ++ 14 files changed, 1803 insertions(+) create mode 100644 example/ck_tile/18_flatmm/CMakeLists.txt create mode 100644 example/ck_tile/18_flatmm/README.md create mode 100644 example/ck_tile/18_flatmm/flatmm_basic.cpp create mode 100644 example/ck_tile/18_flatmm/flatmm_basic.hpp create mode 100644 example/ck_tile/18_flatmm/run_flatmm_example.inc create mode 100755 example/ck_tile/18_flatmm/script/smoke_test_basic.sh create mode 100644 include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1.hpp create mode 100644 include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_custom_policy.hpp create mode 100644 include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp create mode 100644 include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp create mode 100644 include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp create mode 100644 
include/ck_tile/ops/flatmm/pipeline/tile_flatmm_shape.hpp diff --git a/example/ck_tile/18_flatmm/CMakeLists.txt b/example/ck_tile/18_flatmm/CMakeLists.txt new file mode 100644 index 0000000000..9fbe65e3a7 --- /dev/null +++ b/example/ck_tile/18_flatmm/CMakeLists.txt @@ -0,0 +1,7 @@ +add_executable(tile_example_flatmm_basic EXCLUDE_FROM_ALL flatmm_basic.cpp) + +set(EXAMPLE_FLATMM_COMPILE_OPTIONS) +# list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) +# list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-unused-variable -Wno-unused-parameter) +# list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-unused-local-typedef) +target_compile_options(tile_example_flatmm_basic PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) diff --git a/example/ck_tile/18_flatmm/README.md b/example/ck_tile/18_flatmm/README.md new file mode 100644 index 0000000000..beaac785fc --- /dev/null +++ b/example/ck_tile/18_flatmm/README.md @@ -0,0 +1,35 @@ +# FLATMM Matrix Multiplication + +This folder contains example for FLATMM using ck_tile tile-programming implementation. Currently, it only supports the basic feature of the CK Tile FLATMM, but creates the placeholders for the future support on different FLATMM pipeline and different FLATMM modules. In the near future, we will gradually migrate all the FLATMM features from old CK to CK Tile. 
+ +## build +``` +# in the root of ck_tile +mkdir build && cd build +# you can replace with the appropriate architecture (for example gfx90a or gfx942) or leave it blank +sh ../script/cmake-ck-dev.sh ../ +# The basic pipeline method on the flatmm calculation +make tile_example_flatmm_basic -j +``` +This will result in an executable `build/bin/tile_example_flatmm_basic` + +## example +``` +args: + -b batch size (default:1) + -m m dimension (default:1024) + -n n dimension (default:2048) + -k k dimension (default:64) + -a_layout Tensor A data layout (default: R) + -b_layout Tensor B data layout (default: R) + -c_layout Tensor C data layout (default: R) + -stride_a Tensor A stride (default:0) + -stride_b Tensor B stride (default:0) + -stride_c Tensor C stride (default:0) + -v 0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2) + -e Absolute error tolerance (default:1e-5) + -prec data type. fp16/bf16/fp8/bf8 (default:fp16) + -warmup number of iterations before benchmark the kernel (default:10) + -repeat number of iterations to benchmark the kernel (default:100) + -timer gpu:gpu timer, cpu:cpu timer (default:gpu) +``` diff --git a/example/ck_tile/18_flatmm/flatmm_basic.cpp b/example/ck_tile/18_flatmm/flatmm_basic.cpp new file mode 100644 index 0000000000..05d0c73b7e --- /dev/null +++ b/example/ck_tile/18_flatmm/flatmm_basic.cpp @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include +#include +#include +#include +#include + +#include "ck_tile/host.hpp" +#include "flatmm_basic.hpp" + +template +float flatmm_calc(const ck_tile::FlatmmHostArgs& args, const ck_tile::stream_config& s) +{ + // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part. 
+ constexpr bool kPadM = false; + constexpr bool kPadN = false; + constexpr bool kPadK = false; + + constexpr int kBlockPerCu = 2; + + // This part comes from the Codegen + constexpr ck_tile::index_t M_Tile = 128; + constexpr ck_tile::index_t N_Tile = 128; + constexpr ck_tile::index_t K_Tile = 64; + + constexpr ck_tile::index_t M_Warp = 1; + constexpr ck_tile::index_t N_Warp = 4; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 32; + constexpr ck_tile::index_t N_Warp_Tile = 32; + constexpr ck_tile::index_t K_Warp_Tile = 16; + + using CodegenFlatmmShape = + ck_tile::TileFlatmmShape, + ck_tile::sequence, + ck_tile::sequence>; + + using TilePartitioner = ck_tile::GemmTile1DPartitioner; + + using CodegenGemmTraits = + ck_tile::TileGemmTraits; + using CodegenPipelineProblem = ck_tile::GemmPipelineProblem; + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; + + using CodegenFlatmmPolicy = ck_tile::UniversalFlatmmPipelineAgBgCrPolicy; + using CodegenFlatmmPipeline = + ck_tile::FlatmmPipelineAGmemBGmemCRegV1; + + // ToDo: Will add the codegen part to test different pipeline policies in GEMM. + // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. + using Kernel = ck_tile::FlatmmKernel; + + auto kargs = Kernel::MakeKernelArgs(args); + + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); + constexpr dim3 blocks = Kernel::BlockSize(); + + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! 
Skipping gemm!\n"); + } + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return ave_time; +} + +#include "run_flatmm_example.inc" + +int main(int argc, char* argv[]) { return !run_flatmm_example(argc, argv); } diff --git a/example/ck_tile/18_flatmm/flatmm_basic.hpp b/example/ck_tile/18_flatmm/flatmm_basic.hpp new file mode 100644 index 0000000000..355ac45ebe --- /dev/null +++ b/example/ck_tile/18_flatmm/flatmm_basic.hpp @@ -0,0 +1,100 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/flatmm.hpp" +#include "ck_tile/ops/gemm.hpp" + +#define CK_TILE_PIPELINE_COMPUTE 1 +#define CK_TILE_PIPELINE_MEMORY 2 + +#ifndef CK_TILE_PIPELINE_DEFAULT +#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE +#endif + +#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY) +#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrMem +#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrMem +#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Interwave +#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE) +#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV3 +#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV3 +#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave +#else +#error "unsupported CK_TILE_PIPELINE_DEFAULT value" +#endif + +template +struct GemmBasicTypeConfig; + +template <> +struct GemmBasicTypeConfig +{ + using ADataType = ck_tile::half_t; + using BDataType = ck_tile::half_t; + using 
AccDataType = float; + using CDataType = ck_tile::half_t; + // ToDo: Add more bias config to support different categories of GEMM. +}; + +template +struct DataTypeTraits; + +template <> +struct DataTypeTraits +{ + static constexpr const char* name = "fp32"; +}; + +template <> +struct DataTypeTraits +{ + static constexpr const char* name = "fp64"; +}; + +template <> +struct DataTypeTraits +{ + static constexpr const char* name = "fp16"; +}; + +using Types = GemmBasicTypeConfig; + +// Specific type aliases for easy access +using ADataType = Types::ADataType; +using BDataType = Types::BDataType; +using AccDataType = Types::AccDataType; +using CDataType = Types::CDataType; + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("m", "256", "m dimension") + .insert("n", "256", "n dimension") + .insert("k", "128", "k dimension") + .insert("a_layout", "R", "A tensor data layout - Row by default") + .insert("b_layout", "C", "B tensor data layout - Row by default") + .insert("c_layout", "R", "C tensor data layout - Row by default") + .insert("stride_a", "0", "Tensor A stride") + .insert("stride_b", "0", "Tensor B stride") + .insert("stride_c", "0", "Tensor C stride") + .insert("v", "1", "0. No validation, 1. Validation on CPU, 2. Validation on GPU") + .insert("prec", "fp16", "data type. 
fp16/bf16/fp8/bf8") + .insert("warmup", "50", "number of iterations before benchmark the kernel") + .insert("repeat", "100", "number of iterations to benchmark the kernel") + .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer") + .insert("split_k", "1", "splitK value"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +// host API +float flatmm_calc(const ck_tile::FlatmmHostArgs& args, const ck_tile::stream_config& s); diff --git a/example/ck_tile/18_flatmm/run_flatmm_example.inc b/example/ck_tile/18_flatmm/run_flatmm_example.inc new file mode 100644 index 0000000000..864d888074 --- /dev/null +++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +#pragma once + +template +static constexpr inline auto is_row_major(Layout layout_) +{ + return ck_tile::bool_constant, + ck_tile::tensor_layout::gemm::RowMajor>>{}; +} + +// mfma_type, 0:32x32, 1:16x16 +template +auto shuffle_b(const ck_tile::HostTensor& t, std::string mfma_dtype, int mfma_type = 0) +{ + assert(t.get_lengths().size() == 2); + int n_ = t.get_lengths()[1]; + int k_ = t.get_lengths()[0]; + + if((mfma_dtype == "bf16" || mfma_dtype == "fp16") && mfma_type == 0) + { + ck_tile::HostTensor t_view({n_ / 32, 32, k_ / 16, 2, 8}); + std::copy(t.begin(), t.end(), t_view.begin()); + return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4}); + } + else if((mfma_dtype == "bf16" || mfma_dtype == "fp16") && mfma_type == 1) + { + ck_tile::HostTensor t_view({n_ / 16, 16, k_ / 32, 4, 8}); + std::copy(t.begin(), t.end(), t_view.begin()); + return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4}); + } + else if((mfma_dtype == "int8" || mfma_dtype == "fp8") && mfma_type == 0) + { + ck_tile::HostTensor t_view({n_ / 32, 32, k_ / 32, 2, 16}); + std::copy(t.begin(), t.end(), t_view.begin()); + return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 
4}); + } + else if((mfma_dtype == "int8" || mfma_dtype == "fp8") && mfma_type == 1) + { + ck_tile::HostTensor t_view({n_ / 16, 16, k_ / 64, 4, 16}); + std::copy(t.begin(), t.end(), t_view.begin()); + return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4}); + } + return t; +} + +auto calculate_rtol_atol(const ck_tile::index_t K, + const ck_tile::index_t kbatch, + const float max_accumulated_value) +{ + using ComputeType = + std::conditional_t; + // Calculate thresholds + const auto rtol = ck_tile::get_relative_threshold( + ck_tile::integer_divide_ceil(K, kbatch)); + const auto atol = ck_tile::get_absolute_threshold( + max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); + // Calculate error due to split_k accumulation + const auto rtol_split_k = + ck_tile::get_relative_threshold(kbatch); + const auto atol_split_k = ck_tile::get_absolute_threshold( + max_accumulated_value, kbatch); + // Use higher threshold + return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); +} + +template +float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf, + ck_tile::DeviceMem& b_shuffle_dev_buf, + ck_tile::DeviceMem& c_dev_buf, + ck_tile::index_t M, + ck_tile::index_t N, + ck_tile::index_t K, + ck_tile::index_t stride_A, + ck_tile::index_t stride_B, + ck_tile::index_t stride_C, + ck_tile::index_t kbatch, + int n_warmup, + int n_repeat) +{ + ck_tile::FlatmmHostArgs args; + args.a_ptr = a_dev_buf.GetDeviceBuffer(); + args.b_shuffle_ptr = b_shuffle_dev_buf.GetDeviceBuffer(); + args.c_ptr = c_dev_buf.GetDeviceBuffer(); + + args.k_batch = kbatch; + args.M = M; + args.N = N; + args.K = K; + args.stride_A = stride_A; + args.stride_B = stride_B; + args.stride_C = stride_C; + + float ave_time = flatmm_calc( + args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_byte = + sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N; + float 
tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Run Flatmm kernel with M =" << M << " N =" << N << " K =" << K + << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C + << " : " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << std::endl; + + return ave_time; +} + +template +int run_flatmm_example_with_layouts(int argc, + char* argv[], + const ALayout a_layout = ALayout{}, + const BLayout b_layout = BLayout{}, + [[maybe_unused]] const CLayout c_layout = CLayout{}) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + ck_tile::index_t M = arg_parser.get_int("m"); + ck_tile::index_t N = arg_parser.get_int("n"); + ck_tile::index_t K = arg_parser.get_int("k"); + + ck_tile::index_t stride_A = arg_parser.get_int("stride_a"); + ck_tile::index_t stride_B = arg_parser.get_int("stride_b"); + ck_tile::index_t stride_C = arg_parser.get_int("stride_c"); + + ck_tile::index_t kbatch = arg_parser.get_int("split_k"); + int n_warmup = arg_parser.get_int("warmup"); + int n_repeat = arg_parser.get_int("repeat"); + + stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout)); + stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout)); + stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{})); + + ck_tile::HostTensor a_host( + ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout))); + ck_tile::HostTensor b_origin_host( + ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout))); + ck_tile::HostTensor c_rslt_host( + ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{}))); + + // TODO: add different init types + ck_tile::FillUniformDistribution{-.5f, .5f}(a_host); + ck_tile::FillUniformDistribution{-.5f, .5f}(b_origin_host); + + ck_tile::DeviceMem a_dev_buf(a_host.get_element_space_size_in_bytes()); + 
ck_tile::DeviceMem c_dev_buf(c_rslt_host.get_element_space_size_in_bytes()); + + a_dev_buf.ToDevice(a_host.data()); + c_rslt_host.SetZero(); + + // do pre-shuffle + std::string mfma = arg_parser.get_str("prec"); + ck_tile::HostTensor b_shuffle_host = shuffle_b(b_origin_host, mfma, 0); + ck_tile::DeviceMem b_shuffle_dev_buf(b_shuffle_host.get_element_space_size_in_bytes()); + b_shuffle_dev_buf.ToDevice(b_shuffle_host.data()); + + invoke_flatmm(a_dev_buf, + b_shuffle_dev_buf, + c_dev_buf, + M, + N, + K, + stride_A, + stride_B, + stride_C, + kbatch, + n_warmup, + n_repeat); + + c_dev_buf.FromDevice(c_rslt_host.data()); + bool pass = true; + + if(arg_parser.get_int("v") == 1) + { + ck_tile::HostTensor c_ref_host( + ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{}))); + c_ref_host.SetZero(); + + ck_tile::reference_gemm( + a_host, b_origin_host, c_ref_host); + const float max_accumulated_value = + *std::max_element(c_ref_host.mData.begin(), c_ref_host.mData.end()); + const auto rtol_atol = calculate_rtol_atol(K, kbatch, max_accumulated_value); + pass = ck_tile::check_err(c_rslt_host, + c_ref_host, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) + << std::endl; + std::cout << "The CPU veification result is:" << (pass ? 
"correct" : "fail") << std::endl; + } + else if(arg_parser.get_int("v") == 2) + { + ck_tile::DeviceMem b_origin_dev_buf(b_origin_host.get_element_space_size_in_bytes()); + b_origin_dev_buf.ToDevice(b_origin_host.data()); + + ck_tile::HostTensor c_gpu_ref_host( + ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{}))); + ck_tile::DeviceMem c_gpu_ref_dev_buf(c_gpu_ref_host.get_element_space_size_in_bytes()); + c_gpu_ref_host.SetZero(); + c_gpu_ref_dev_buf.SetZero(); + + ADataType* d_A; + BDataType* d_B; + CDataType* d_C; + + ck_tile::hip_check_error(hipMalloc(&d_A, M * K * sizeof(ADataType))); + ck_tile::hip_check_error(hipMalloc(&d_B, N * K * sizeof(BDataType))); + ck_tile::hip_check_error(hipMalloc(&d_C, M * N * sizeof(CDataType))); + + ck_tile::hip_check_error(hipMemcpy( + d_A, a_dev_buf.GetDeviceBuffer(), M * K * sizeof(ADataType), hipMemcpyHostToDevice)); + ck_tile::hip_check_error(hipMemcpy(d_B, + b_origin_dev_buf.GetDeviceBuffer(), + N * K * sizeof(BDataType), + hipMemcpyHostToDevice)); + + ck_tile::reference_gemm_gpu(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C); + + ck_tile::hip_check_error(hipMemcpy(c_gpu_ref_dev_buf.GetDeviceBuffer(), + d_C, + M * N * sizeof(CDataType), + hipMemcpyDeviceToHost)); + + ck_tile::hip_check_error(hipFree(d_A)); + ck_tile::hip_check_error(hipFree(d_B)); + ck_tile::hip_check_error(hipFree(d_C)); + + c_gpu_ref_dev_buf.FromDevice(c_gpu_ref_host.data()); + const float max_accumulated_value = + *std::max_element(c_gpu_ref_host.mData.begin(), c_gpu_ref_host.mData.end()); + const auto rtol_atol = calculate_rtol_atol(K, kbatch, max_accumulated_value); + pass = ck_tile::check_err(c_rslt_host, + c_gpu_ref_host, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) + << std::endl; + std::cout << "The GPU 
veification result is: " << (pass ? "correct" : "fail") << std::endl; + } + + return pass; +} + +int run_flatmm_example(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + using Row = ck_tile::tensor_layout::gemm::RowMajor; + using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + + std::string a_layout = arg_parser.get_str("a_layout"); + std::string b_layout = arg_parser.get_str("b_layout"); + + if(a_layout == "R" && b_layout == "C") + { + return run_flatmm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); + } + else + { + throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!"); + } +} diff --git a/example/ck_tile/18_flatmm/script/smoke_test_basic.sh b/example/ck_tile/18_flatmm/script/smoke_test_basic.sh new file mode 100755 index 0000000000..a3fc61cc31 --- /dev/null +++ b/example/ck_tile/18_flatmm/script/smoke_test_basic.sh @@ -0,0 +1,34 @@ +#!/bin/bash +EXE="$(find . -name tile_example_flatmm_basic -type f | head -n 1)" +KNAME=1 + +export CK_WARMUP=0 +export CK_REPEAT=1 + +COMMON_ARGS='-v=2 -warmup=0 -repeat=1' + +run_tests() { + for m in 128 1024; do + for n in 128 2048; do + for k in 128 4096; do + + $EXE -m=$m -n=$n -k=$k -stride_a=0 -stride_b=0 -stride_c=0 -prec=$1 $COMMON_ARGS + if [ $? -eq 0 ]; then + echo "Success: Test with m=$m, n=$n, k=$k executed successfully." + else + echo "Error: Test with m=$m, n=$n, k=$k failed to execute properly." 
+ # Optionally, exit or break if you need to halt further execution + # exit 1 + fi + + done + done + done +} + +set -x + +run_tests "bf16" +run_tests "fp16" + +set +x diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index 7f4ba2ed35..88efe0d8d9 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -17,4 +17,5 @@ add_subdirectory(14_moe_smoothquant) add_subdirectory(15_fused_moe) add_subdirectory(16_batched_gemm) add_subdirectory(17_grouped_gemm) +add_subdirectory(18_flatmm) add_subdirectory(35_batched_transpose) diff --git a/include/ck_tile/ops/flatmm.hpp b/include/ck_tile/ops/flatmm.hpp index 82f6d48eda..1714789e63 100644 --- a/include/ck_tile/ops/flatmm.hpp +++ b/include/ck_tile/ops/flatmm.hpp @@ -3,10 +3,16 @@ #pragma once +#include "ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1.hpp" +#include "ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_custom_policy.hpp" #include "ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp" #include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp" #include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp" #include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp" +#include "ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp" +#include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp" +#include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp" +#include "ck_tile/ops/flatmm/pipeline/tile_flatmm_shape.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1.hpp new file mode 100644 index 0000000000..935eb2c028 --- /dev/null +++ b/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1.hpp @@ -0,0 +1,187 @@ +// 
SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_custom_policy.hpp" + +namespace ck_tile { + +// A is block window on shared memory +// B is block window on shared memory +// C is block distributed tensor +template +struct BlockFlatmmASmemBSmemCRegV1 +{ + using Problem = remove_cvref_t; + using BlockPolicy = remove_cvref_t; + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using CDataType = remove_cvref_t; + using BlockGemmShape = remove_cvref_t; // TileFlatmmShape + + static constexpr auto I0 = number<0>(); + static constexpr auto I1 = number<1>(); + static constexpr auto I2 = number<2>(); + static constexpr auto idxM = I0; + static constexpr auto idxN = I1; + static constexpr auto idxK = I2; + using BlockTile = remove_cvref_t; + using BlockWarps = remove_cvref_t; + using WarpTile = remove_cvref_t; + + static constexpr index_t kBlockSize = Problem::kBlockSize; + + CK_TILE_DEVICE static constexpr auto MakeCBlockTile() + { + constexpr index_t MPerBlock = BlockGemmShape::kM; + constexpr index_t NPerBlock = BlockGemmShape::kN; + + constexpr auto config = BlockPolicy::template GetWarpGemmMWarpNWarp(); + + using WG = remove_cvref_t())>; + + constexpr index_t MWarp = config.template at<1>(); + constexpr index_t NWarp = config.template at<2>(); + + constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM); + constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN); + + constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding< + sequence<>, + tuple, sequence>, + tuple>, + tuple>, + sequence<1, 2>, + sequence<0, 0>>{}; + + constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding( + c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{}); + + constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode); 
+ + auto c_block_tensor = make_static_distributed_tensor(c_block_dstr); + return c_block_tensor; + } + + // C += A * B + template + CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor, + const ABlockWindow& a_block_window, + const BFlatBlockWindow& b_flat_block_window) const + { + static_assert(std::is_same_v && + std::is_same_v && + std::is_same_v, + "wrong!"); + constexpr index_t MPerBlock = ABlockWindow{}.get_window_lengths()[number<0>{}]; + constexpr index_t KPerBlock = ABlockWindow{}.get_window_lengths()[number<1>{}]; + + static_assert(MPerBlock == BlockGemmShape::kM && KPerBlock == BlockGemmShape::kK, "wrong!"); + + constexpr auto config = BlockPolicy::template GetWarpGemmMWarpNWarp(); + using WG = remove_cvref_t())>; + + constexpr index_t MWarp = config.template at<1>(); + constexpr index_t NWarp = config.template at<2>(); + + constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM); + constexpr index_t NIterPerWarp = + BlockTile::at(idxN) / (WarpTile::at(idxN) * BlockWarps::at(idxN)); + constexpr index_t KIterPerWarp = KPerBlock / WG::kK; + + constexpr index_t MPerBlockPerIter = MPerBlock / MIterPerWarp; + constexpr index_t KPerBlockPerIter = KPerBlock / KIterPerWarp; + + constexpr index_t NFlatPerBlockPerIter = BlockGemmShape::flatNPerWarp; + constexpr index_t KFlatPerBlockPerIter = BlockGemmShape::flatKPerWarp; + + const index_t iMWarp = get_warp_id() / NWarp; + + // construct A-warp-window + auto a_warp_window_tmp = make_tile_window( + a_block_window.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + a_block_window.get_window_origin() + multi_index<2>{iMWarp * WG::kM, 0}, + make_static_tile_distribution(typename WG::AWarpDstrEncoding{})); + statically_indexed_array< + statically_indexed_array, + MIterPerWarp> + a_warp_windows; + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { + a_warp_windows(mIter)(kIter) = a_warp_window_tmp; + + 
move_tile_window(a_warp_windows(mIter)(kIter), + {mIter * MPerBlockPerIter, kIter * KPerBlockPerIter}); + }); + }); + + // construct Bflat-warp-window + auto b_flat_warp_windows_tmp = b_flat_block_window; + statically_indexed_array< + statically_indexed_array, + NIterPerWarp> + b_flat_warp_windows; + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { + b_flat_warp_windows(nIter)(kIter) = b_flat_warp_windows_tmp; + + move_tile_window(b_flat_warp_windows(nIter)(kIter), + {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter}); + }); + }); + + // auto b_warp_windows = b_origin_warp_windows; + auto b_warp_windows = b_flat_warp_windows; + + using CWarpDstr = typename WG::CWarpDstr; + using CWarpTensor = typename WG::CWarpTensor; + + constexpr auto c_warp_y_lengths = + to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths()); + constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t{}; + + // hot loop: + static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { + static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { + // read A warp tensor from A block window + const auto a_warp_tensor = load_tile(a_warp_windows(mIter)(kIter)); + + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { + // read B warp tensor from B Block window + const auto b_warp_tensor = load_tile(b_warp_windows(nIter)(kIter)); + + // read C warp tensor from C block tensor + CWarpTensor c_warp_tensor; + + c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); + + // warp GEMM + WG{}(c_warp_tensor, a_warp_tensor, b_warp_tensor); + + // write C warp tensor into C block tensor + c_block_tensor.set_y_sliced_thread_data( + merge_sequences(sequence{}, c_warp_y_index_zeros), + merge_sequences(sequence<1, 1>{}, c_warp_y_lengths), + c_warp_tensor.get_thread_buffer()); + }); + }); + }); + } + + // C = A * B + template + 
CK_TILE_DEVICE auto operator()(const ABlockTensorTmp& a_block_tensor_tmp, + const BFlatBlockWindow& b_flat_block_window) const + { + auto c_block_tensor = MakeCBlockTile(); + operator()(c_block_tensor, a_block_tensor_tmp, b_flat_block_window); + return c_block_tensor; + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_custom_policy.hpp b/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_custom_policy.hpp new file mode 100644 index 0000000000..d5b062a1b3 --- /dev/null +++ b/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_custom_policy.hpp @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +// Default policy for BlockGemmASmemBSmemCRegV1 +// Default policy class should not be templated, put template on member functions instead +template +struct BlockFlatmmASmemBSmemCRegV1CustomPolicy +{ + using AType = remove_cvref_t; + using BType = remove_cvref_t; + using CType = remove_cvref_t; + + using BlockWarps = remove_cvref_t; + + static constexpr index_t kMWarps = BlockWarps::at(number<0>{}); + static constexpr index_t kNWarps = BlockWarps::at(number<1>{}); + static constexpr index_t kKWarps = BlockWarps::at(number<2>{}); + + using WarpGemm = remove_cvref_t; + + template + CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemmMWarpNWarp() + { + return make_tuple(WarpGemm{}, kMWarps, kNWarps); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp new file mode 100644 index 0000000000..eb45e6c0bd --- /dev/null +++ b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp @@ -0,0 +1,496 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" + +namespace ck_tile { + +struct FlatmmProblem +{ + CK_TILE_HOST FlatmmProblem() = default; + CK_TILE_HOST FlatmmProblem( + index_t M_, index_t N_, index_t K_, index_t stride_A_, index_t stride_B_, index_t stride_C_) + : M(M_), N(N_), K(K_), stride_A(stride_A_), stride_B(stride_B_), stride_C(stride_C_) + { + } + + index_t M; + index_t N; + index_t K; + index_t stride_A; + index_t stride_B; + index_t stride_C; +}; + +struct FlatmmHostArgs : public FlatmmProblem +{ + CK_TILE_HOST FlatmmHostArgs() = default; + CK_TILE_HOST FlatmmHostArgs(const void* a_ptr_, + const void* b_shuffle_ptr_, + void* c_ptr_, + index_t k_batch_, + index_t M_, + index_t N_, + index_t K_, + index_t stride_A_, + index_t stride_B_, + index_t stride_C_) + : FlatmmProblem(M_, N_, K_, stride_A_, stride_B_, stride_C_), + a_ptr(a_ptr_), + b_shuffle_ptr(b_shuffle_ptr_), + c_ptr(c_ptr_), + k_batch(k_batch_) + { + } + + const void* a_ptr; + const void* b_shuffle_ptr; + void* c_ptr; + index_t k_batch; +}; + +template +struct FlatmmKernel +{ + using TilePartitioner = remove_cvref_t; + using FlatmmPipeline = remove_cvref_t; + using BlockGemmShape = + remove_cvref_t; // TileFlatmmShape + using EpiloguePipeline = remove_cvref_t; + using ALayout = remove_cvref_t; + using BLayout = remove_cvref_t; + using CLayout = remove_cvref_t; + static constexpr index_t KernelBlockSize = FlatmmPipeline::BlockSize; + + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + // Below type is actually accumulation data type - the output of block GEMM. 
+ using CDataType = remove_cvref_t; + + static constexpr auto I0 = number<0>(); + static constexpr auto I1 = number<1>(); + static constexpr auto I2 = number<2>(); + static constexpr auto idxM = I0; + static constexpr auto idxN = I1; + static constexpr auto idxK = I2; + + [[nodiscard]] CK_TILE_HOST static const std::string GetName() + { + // clang-format off + return concat('_', "gemm", gemm_prec_str, FlatmmPipeline::GetName()); + // clang-format on + } + + CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N, index_t KBatch) + { + return dim3(TilePartitioner::GridSize(M, N), 1, KBatch); + } + + CK_TILE_HOST static constexpr auto BlockSize() { return dim3(KernelBlockSize); } + + struct FlatmmKernelArgs + { + const void* a_ptr; + const void* b_shuffle_ptr; + void* c_ptr; + index_t M; + index_t N; + index_t K; + index_t stride_A; + index_t stride_B; + index_t stride_C; + index_t k_batch; + }; + + CK_TILE_HOST static constexpr FlatmmKernelArgs MakeKernelArgs(const FlatmmHostArgs& hostArgs) + { + return FlatmmKernelArgs{hostArgs.a_ptr, + hostArgs.b_shuffle_ptr, + hostArgs.c_ptr, + hostArgs.M, + hostArgs.N, + hostArgs.K, + hostArgs.stride_A, + hostArgs.stride_B, + hostArgs.stride_C, + hostArgs.k_batch}; + } + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return max(FlatmmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize()); + } + + struct SplitKBatchOffset + { + __device__ SplitKBatchOffset(const FlatmmKernelArgs& kargs, + const std::size_t k_id = blockIdx.z) + { + constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}); + const index_t K_t = kargs.k_batch * K1; + const index_t KRead = (kargs.K + K_t - 1) / K_t * K1; + + if constexpr(std::is_same_v) + { + a_k_split_offset = k_id * KRead; + } + else if constexpr(std::is_same_v) + { + a_k_split_offset = k_id * KRead * kargs.stride_A; + } + + if constexpr(std::is_same_v) + { + b_k_split_offset = k_id * KRead * kargs.stride_B; + } + else if 
constexpr(std::is_same_v) + { + b_k_split_offset = k_id * KRead; + } + + if(k_id < static_cast(kargs.k_batch - 1)) + { + splitted_k = KRead; + } + else + { + splitted_k = kargs.K - KRead * (kargs.k_batch - 1); + } + } + + index_t a_k_split_offset; + index_t b_k_split_offset; + index_t splitted_k; + }; + + CK_TILE_HOST static bool IsSupportedArgument(const FlatmmKernelArgs& kargs) + { + if constexpr(EpiloguePipeline::GetVectorSizeC() % 2 != 0 && + is_any_of::value) + { + if(kargs.k_batch != 1) + { + std::cerr << "Conditions not met for Kbatch >1 !" << std::endl; + return false; + } + } + + if constexpr(std::is_same_v) + { + if(kargs.K % TilePartitioner::KPerBlock != 0 && FlatmmPipeline::kPadK == false) + { + std::cerr << "Can't support K that is not a multiple of KPerBlock" + " without padding!" + << std::endl; + return false; + } + if(kargs.K % FlatmmPipeline::GetVectorSizeA() != 0) + { + std::cerr << "K is not a multiple of vector load size for A tensor!" << std::endl; + return false; + } + } + else + { + if(kargs.M % TilePartitioner::MPerBlock != 0 && FlatmmPipeline::kPadM == false) + { + std::cerr << "Can't support M that is not a multiple of MPerBlock" + " without padding!" + << std::endl; + return false; + } + if(kargs.M % FlatmmPipeline::GetVectorSizeA() != 0) + { + std::cerr << "M is not a multiple of vector load size for A tensor!" << std::endl; + return false; + } + } + + if constexpr(std::is_same_v) + { + if(kargs.N % TilePartitioner::NPerBlock != 0 && FlatmmPipeline::kPadN == false) + { + std::cerr << "Can't support N that is not a multiple of NPerBlock" + " without padding!" + << std::endl; + return false; + } + if(kargs.N % FlatmmPipeline::GetVectorSizeB() != 0) + { + std::cerr << "N is not a multiple of vector load size for B tensor!" 
<< std::endl; + return false; + } + } + else + { + if(kargs.K % TilePartitioner::KPerBlock != 0 && FlatmmPipeline::kPadK == false) + { + std::cerr << "Can't support K that is not a multiple of KPerBlock" + " without padding!" + << std::endl; + return false; + } + if(kargs.K % FlatmmPipeline::GetVectorSizeB() != 0) + { + std::cerr << "K is not a multiple of vector load size for B tensor!" << std::endl; + return false; + } + } + + if constexpr(std::is_same_v) + { + if(kargs.N % TilePartitioner::NPerBlock != 0 && FlatmmPipeline::kPadN == false) + { + std::cerr << "Can't support N that is not a multiple of NPerBlock" + " without padding!" + << std::endl; + return false; + } + if(kargs.N % EpiloguePipeline::GetVectorSizeC() != 0) + { + std::cerr << "N is not a multiple of vector load size for C tensor!" << std::endl; + return false; + } + } + else + { + if(kargs.M % TilePartitioner::MPerBlock != 0 && FlatmmPipeline::kPadM == false) + { + std::cerr << "Can't support M that is not a multiple of MPerBlock" + " without padding!" + << std::endl; + return false; + } + if(kargs.M % EpiloguePipeline::GetVectorSizeC() != 0) + { + std::cerr << "M is not a multiple of vector load size for C tensor!" 
<< std::endl; + return false; + } + } + return true; + } + + template + CK_TILE_DEVICE static auto MakeGemmTensorViews(const ADataType* a_ptr, + const BDataType* b_flat_ptr, + CDataType* c_ptr, + const FlatmmKernelArgs& kargs, + const SplitKBatchOffset& splitk_batch_offset) + { + const auto& a_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + a_ptr, + make_tuple(kargs.M, splitk_batch_offset.splitted_k), + make_tuple(kargs.stride_A, 1), + number{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + a_ptr, + make_tuple(splitk_batch_offset.splitted_k, kargs.M), + make_tuple(kargs.stride_A, 1), + number{}, + number<1>{}); + } + }(); + + index_t kFlatK = FlatmmPipeline::flatKPerWarp * (splitk_batch_offset.splitted_k / + BlockGemmShape::WarpTile::at(number<2>{})); + index_t kFlatN = kargs.N * kargs.K / kFlatK; + const auto& b_flat_tensor_view = [&]() { + return make_naive_tensor_view( + b_flat_ptr, + make_tuple(kFlatN, kFlatK), + make_tuple(kFlatK, 1), + number{}, + number<1>{}); + }(); + + // TODO: enable vector write for C in ColMajor + const auto& c_tensor_view = [&]() { + if constexpr(std::is_same_v) + { + return make_naive_tensor_view( + c_ptr, + make_tuple(kargs.M, kargs.N), + make_tuple(kargs.stride_C, 1), + number{}, + number<1>{}); + } + else + { + return make_naive_tensor_view( + c_ptr, + make_tuple(kargs.M, kargs.N), + make_tuple(1, kargs.stride_C), + number<1>{}, + number<1>{}); + } + }(); + + return make_tuple(a_tensor_view, b_flat_tensor_view, c_tensor_view); + } + + template + CK_TILE_DEVICE static auto MakeGemmPadViews(const TensorView& views) + { + const auto& a_pad_view = [&]() { + const auto& a_tensor_view = views.at(I0); + if constexpr(std::is_same_v) + { + return pad_tensor_view(a_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); + } + else + { + return pad_tensor_view(a_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); + } + }(); + + const auto& b_flat_tensor_view = 
views.at(I1); + + // TODO vector write in for C in ColMajor + const auto& c_pad_view = [&]() { + const auto& c_tensor_view = views.at(I2); + if constexpr(std::is_same_v) + { + return pad_tensor_view(c_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); + } + else + { + return pad_tensor_view(c_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); + } + }(); + + return make_tuple(a_pad_view, b_flat_tensor_view, c_pad_view); + } + + template + CK_TILE_DEVICE static auto + MakeGemmTileWindows(const PadView& views, const index_t i_m, const index_t i_n) + { + const auto& a_pad_view = views.at(I0); + const auto& b_flat_pad_view = views.at(I1); + const auto& c_pad_view = views.at(I2); + + const auto& a_block_window = [&]() { + if constexpr(std::is_same_v) + { + return make_tile_window(a_pad_view, + make_tuple(number{}, + number{}), + {i_m, 0}); + } + else + { + return make_tile_window(a_pad_view, + make_tuple(number{}, + number{}), + {0, i_m}); + } + }(); + + const auto& b_flat_block_window = + make_tile_window(b_flat_pad_view, + make_tuple(number{}, + number{}), + {static_cast(i_n / BlockGemmShape::WarpTile::at(idxN)), 0}); + + auto c_block_window = make_tile_window( + c_pad_view, + make_tuple(number{}, number{}), + {i_m, i_n}); + + return make_tuple(a_block_window, b_flat_block_window, c_block_window); + } + + template + CK_TILE_DEVICE static void RunFlatmm(const ADataType* a_ptr, + const BDataType* b_flat_ptr, + CDataType* c_ptr, + void* smem_ptr, + const FlatmmKernelArgs& kargs, + const SplitKBatchOffset& splitk_batch_offset, + const index_t block_idx_m, + const index_t block_idx_n) + { + // Create Gemm tensor views, pad views and tile windows + const auto& gemm_tensor_views_tuple = + MakeGemmTensorViews(a_ptr, b_flat_ptr, c_ptr, kargs, splitk_batch_offset); + const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); + auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); + + const index_t num_loop 
= TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k); + + // Run GEMM cooperatively by whole workgroup. + const auto& a_block_window = gemm_tile_windows.at(I0); + const auto& b_flat_block_window = gemm_tile_windows.at(I1); + const auto& c_block_tile = FlatmmPipeline{}.template operator()( + a_block_window, b_flat_block_window, num_loop, smem_ptr); + + // Run Epilogue Pipeline + auto& c_block_window = gemm_tile_windows.at(I2); + + EpiloguePipeline{} + .template operator()( + c_block_window, c_block_tile, smem_ptr); + } + + CK_TILE_DEVICE void operator()(FlatmmKernelArgs kargs) const + { + const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockIdx.x); + const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); + const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + + const SplitKBatchOffset splitk_batch_offset(kargs); + // options + const ADataType* a_ptr = + static_cast(kargs.a_ptr) + splitk_batch_offset.a_k_split_offset; + const BDataType* b_flat_ptr = static_cast(kargs.b_shuffle_ptr) + + splitk_batch_offset.b_k_split_offset; + CDataType* c_ptr = static_cast(kargs.c_ptr); + + // allocate LDS + __shared__ char smem_ptr[GetSmemSize()]; + + if(kargs.k_batch == 1) + { + RunFlatmm(a_ptr, b_flat_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n); + } + else + { + // Do not compile in case where we have unsupported + // VectorSizeC & data type configuration. 
+ if constexpr(!(EpiloguePipeline::GetVectorSizeC() % 2 != 0 && + is_any_of::value)) + { + RunFlatmm( + a_ptr, b_flat_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n); + } + } + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp new file mode 100644 index 0000000000..3d08c7a788 --- /dev/null +++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp @@ -0,0 +1,208 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp" + +namespace ck_tile { + +template +struct FlatmmPipelineAGmemBGmemCRegV1 +{ + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using CDataType = remove_cvref_t; + using BlockGemmShape = remove_cvref_t; // TileFlatmmShape + + using ALayout = remove_cvref_t; + using BLayout = remove_cvref_t; + using CLayout = remove_cvref_t; + + using BlockFlatmm = + remove_cvref_t())>; + + static constexpr index_t BlockSize = Problem::kBlockSize; + + static constexpr index_t kMPerBlock = BlockGemmShape::kM; + static constexpr index_t kNPerBlock = BlockGemmShape::kN; + static constexpr index_t kKPerBlock = BlockGemmShape::kK; + + static constexpr index_t flatKPerWarp = BlockGemmShape::flatKPerWarp; + static constexpr index_t flatNPerWarp = BlockGemmShape::flatNPerWarp; + + static constexpr index_t GetVectorSizeA() { return Problem::VectorSizeA; } + static constexpr index_t GetVectorSizeB() { return Problem::VectorSizeB; } + static constexpr index_t GetVectorSizeC() { return Problem::VectorSizeC; } + + static constexpr bool kPadM = Problem::kPadM; + static constexpr bool kPadN = Problem::kPadN; + static constexpr bool kPadK = Problem::kPadK; + + static constexpr index_t 
kLdsAlignmentInBytes = 16; + + static constexpr auto I0 = number<0>(); + static constexpr auto I1 = number<1>(); + static constexpr auto I2 = number<2>(); + static constexpr auto idxM = I0; + static constexpr auto idxN = I1; + static constexpr auto idxK = I2; + using BlockTile = remove_cvref_t; + using BlockWarps = remove_cvref_t; + using WarpTile = remove_cvref_t; + + [[nodiscard]] CK_TILE_HOST static const std::string GetName() + { + // clang-format off + return concat('_', "pipeline_AGmemBGmemCRegV1", + concat('x', kMPerBlock, kNPerBlock, kKPerBlock, BlockSize), + concat('x', GetVectorSizeA(), GetVectorSizeB(), GetVectorSizeC()), + concat('x', kPadM, kPadN, kPadK)); + // clang-format on + } + + // For the basic gemm pipelien DoubleSmemBuffer set to be false naturally. + static constexpr bool DoubleSmemBuffer = false; + + CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; } + + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + return PipelinePolicy::template GetSmemSize(); + } + + template + CK_TILE_HOST_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp, + const AElementFunction& a_element_func, + const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp, + index_t num_loop, + void* p_smem) const + { + static_assert( + std::is_same_v>, + "wrong!"); + + static_assert(kMPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}], + "wrong!"); + static_assert(kKPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}], + "wrong!"); + + // A tile in LDS + ADataType* p_a_lds = static_cast(p_smem); + + constexpr auto a_lds_block_desc = + PipelinePolicy::template MakeALdsBlockDescriptor(); + + auto a_lds_block = make_tensor_view(p_a_lds, a_lds_block_desc); + + // A DRAM tile window for load + auto a_copy_dram_window = + make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + a_dram_block_window_tmp.get_window_origin(), + 
PipelinePolicy::template MakeADramTileDistribution()); + + // A LDS tile window for store + auto a_copy_lds_window = make_tile_window( + a_lds_block, make_tuple(number{}, number{}), {0, 0}); + + // A LDS tile for block GEMM + auto a_lds_gemm_window = make_tile_window( + a_lds_block, make_tuple(number{}, number{}), {0, 0}); + + // Block GEMM + auto block_flatmm = BlockFlatmm(); + + // B flat DRAM window for load + auto b_flat_distribution = + PipelinePolicy::template MakeBFlatDramTileDistribution(); + auto b_flat_dram_window = // tile_window_with_static_distribution + make_tile_window( + b_flat_dram_block_window_tmp.get_bottom_tensor_view(), // from kernel gemm_pad_views + make_tuple(number{}, number{}), + b_flat_dram_block_window_tmp.get_window_origin(), + b_flat_distribution); + + // Acc register tile + auto c_block_tile = decltype(block_flatmm(a_lds_gemm_window, b_flat_dram_window)){}; + + // prefetch + // global read 0 + auto a_block_tile = load_tile(a_copy_dram_window); + + { + // move to 1 + move_tile_window(a_copy_dram_window, {0, kKPerBlock}); + + // initialize C + tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); + + // LDS write 0 + if constexpr(std::is_same_v) + { + auto a_shuffle_tmp = make_static_distributed_tensor( + PipelinePolicy::template MakeShuffledARegBlockDistribution()); + shuffle_tile(a_shuffle_tmp, a_block_tile); + const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_shuffle_tmp); + store_tile(a_copy_lds_window, a_block_tile_tmp); + } + else + { + store_tile(a_copy_lds_window, tile_elementwise_in(a_element_func, a_block_tile)); + } + } + + index_t iCounter = num_loop - 1; + while(iCounter > 0) + { + // global read i + 1 + a_block_tile = load_tile(a_copy_dram_window); + + block_sync_lds(); + + // GEMM i + block_flatmm(c_block_tile, a_lds_gemm_window, b_flat_dram_window); + + block_sync_lds(); + + // move to i + 2 + move_tile_window(a_copy_dram_window, {0, kKPerBlock}); + + // LDS write i + 1 + const auto 
a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile); + store_tile(a_copy_lds_window, a_block_tile_tmp); + + // move to next flat K + move_tile_window(b_flat_dram_window, {0, BlockGemmShape::flatKPerBlock}); + + iCounter--; + } + + // tail + { + block_sync_lds(); + + // GEMM num_loop - 1 + block_flatmm(c_block_tile, a_lds_gemm_window, b_flat_dram_window); + } + + return c_block_tile; + } + + template + CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp, + const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp, + index_t num_loop, + void* p_smem) const + { + return operator()( + a_dram_block_window_tmp, + [](const ADataType& a) { return a; }, + b_flat_dram_block_window_tmp, + num_loop, + p_smem); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp new file mode 100644 index 0000000000..d1aac07d54 --- /dev/null +++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp @@ -0,0 +1,265 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp" + +namespace ck_tile { + +struct UniversalFlatmmPipelineAgBgCrPolicy +{ + static constexpr auto I0 = number<0>{}; + static constexpr auto I1 = number<1>{}; + static constexpr auto I2 = number<2>{}; + + // 3d + padding + template + CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor() + { + using namespace ck_tile; + + constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM; + constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK; + + constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, number{}, number<8>{}), + make_tuple(number<(kMPerBlock + 1) * 8>{}, number<8>{}, number<1>{}), + number<8>{}, + number<1>{}); + + constexpr auto a_lds_block_desc = transform_tensor_descriptor( + a_lds_block_desc_0, + make_tuple(make_pass_through_transform(kMPerBlock), + make_merge_transform(make_tuple(kKPerBlock / 8, 8))), + make_tuple(sequence<1>{}, sequence<0, 2>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + return a_lds_block_desc; + } + + template + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeA() + { + constexpr index_t smem_size_a = sizeof(typename Problem::ADataType) * + MakeALdsBlockDescriptor().get_element_space_size(); + return smem_size_a; + } + + template + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + { + constexpr index_t smem_size_a = GetSmemSizeA(); + + return smem_size_a; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackA() + { + return Problem::VectorLoadSize; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeADramTileDistribution() + { + using ADataType = remove_cvref_t; + using ALayout = remove_cvref_t; + + constexpr index_t BlockSize = Problem::kBlockSize; + + constexpr index_t MPerBlock = Problem::BlockGemmShape::kM; + constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; + + if constexpr(std::is_same_v) + { + constexpr 
index_t M1 = Problem::VectorLoadSize / sizeof(ADataType); + constexpr index_t M0 = MPerBlock / M1; + constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize; + static_assert(total_pixels % M1 == 0); + constexpr index_t K3 = total_pixels / M1; + constexpr index_t KPack = GetSmemPackA(); + static_assert(KPack % K3 == 0); + constexpr index_t K2 = KPack / K3; + if constexpr(get_warp_size() % (K2 * M0)) + { + constexpr index_t K1 = get_warp_size() / (K2 * M0); + constexpr index_t K0 = BlockSize / get_warp_size(); + static_assert(KPerBlock == K0 * K1 * K2 * K3); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<2, 1, 2>>, + tuple, sequence<1, 0, 2>>, + sequence<2, 1>, + sequence<3, 1>>{}); + } + else + { + constexpr index_t K1 = (K2 * M0) / get_warp_size(); + constexpr index_t K2_m = K2 / K1; + constexpr index_t K0 = BlockSize / get_warp_size() / K1; + static_assert(KPerBlock == K0 * K1 * K2_m * K3); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<0, 2>>, + sequence<2, 1>, + sequence<3, 1>>{}); + } + } + else + { + constexpr index_t K1 = 16 / sizeof(ADataType); + constexpr index_t K0 = KPerBlock / K1; + constexpr index_t M2 = get_warp_size() / K0; + // coalesce reading for each blocks + if constexpr(get_warp_size() % (M2 * K0) == 0) + { + constexpr index_t M1 = BlockSize / get_warp_size(); + static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error."); + static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error."); + constexpr index_t M0 = MPerBlock / (M2 * M1); + static_assert(M0 * M1 * M2 == MPerBlock, + "Incorrect M0, M2, M1 configuration! 
" + "M0, M1, M2 must cover whole MPerBlock!"); + + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<0, 1>>{}); + } + else + { + constexpr index_t M0 = BlockSize / get_warp_size(); + constexpr index_t M1 = MPerBlock / (M2 * M0); + static_assert(M0 * M1 * M2 == MPerBlock, + "Incorrect M0, M1, M2 configuration! " + "M0, M1, M2 must cover whole MPerBlock!"); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<2, 0>>, + sequence<1, 2>, + sequence<1, 1>>{}); + } + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeBFlatDramTileDistribution() + { + using BDataType = remove_cvref_t; + + using TileShape = typename Problem::BlockGemmShape; // ck_tile::TileFlatmmShape + + constexpr index_t BlockSize = Problem::kBlockSize; + constexpr index_t WaveSize = get_warp_size(); + constexpr index_t WaveNum = BlockSize / WaveSize; + + constexpr index_t KBPerLoad = + Problem::VectorLoadSize / sizeof(BDataType); // dwordx4 load B elem cnt + constexpr index_t KThdPerWave = WaveSize; // threads cnt in K dim + constexpr index_t KWavePerBlk = 1; + constexpr index_t KRepeat = 1; + + constexpr index_t NBPerLoad = 1; + constexpr index_t NThdPerWave = 1; + constexpr index_t NWavePerBlk = TileShape::BlockWarps::at(TileShape::idxN); // N_Warp + constexpr index_t NRepeat = 1; + + constexpr index_t WaveRepeat = WaveNum / TileShape::flatNPerWarp; + + return make_static_tile_distribution( + tile_distribution_encoding< + sequence, // ? 
+ tuple, // second direction + sequence>, // first direction + // wave in blk, // thd in wave + // // + tuple, sequence<1, 2>>, // which direction + tuple, sequence<2, 2>>, // which index + // + sequence<1, 1, 2, 2>, + sequence<0, 3, 0, 3>>{}); + } + + template + CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledARegBlockDistribution() + { + using ALayout = remove_cvref_t; + using ADataType = remove_cvref_t; + static_assert(std::is_same_v); + constexpr index_t kBlockSize = Problem::kBlockSize; + constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM; + constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK; + + constexpr index_t M1 = Problem::VectorLoadSize / sizeof(ADataType); + constexpr index_t M0 = kMPerBlock / M1; + constexpr index_t total_pixels = kMPerBlock * kKPerBlock / kBlockSize; + static_assert(total_pixels % M1 == 0); + constexpr index_t K3 = total_pixels / M1; + constexpr index_t kKPack = GetSmemPackA(); + static_assert(kKPack % K3 == 0); + constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave + constexpr index_t warp_size = get_warp_size(); + if constexpr(warp_size % (K2 * M0) == 0) + { + constexpr index_t K1 = warp_size / (K2 * M0); + constexpr index_t K0 = kBlockSize / warp_size; + + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<2, 1, 2>>, + tuple, sequence<1, 0, 2>>, + sequence<1, 2>, + sequence<1, 3>>{}); + } + else + { + constexpr index_t K1 = (K2 * M0) / get_warp_size(); + constexpr index_t K2_m = K2 / K1; + constexpr index_t K0 = kBlockSize / get_warp_size() / K1; + static_assert(kKPerBlock == K0 * K1 * K2_m * K3); + return make_static_tile_distribution( + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<0, 2>>, + sequence<1, 2>, + sequence<1, 3>>{}); + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetBlockFlatmm() + { + using AccDataType = float; + using BlockWarps = typename 
Problem::BlockGemmShape::BlockWarps; + using WarpTile = typename Problem::BlockGemmShape::WarpTile; + using WarpGemm = WarpGemmMfmaDispatcher; + + using BlockFlatmmPolicy = + BlockFlatmmASmemBSmemCRegV1CustomPolicy; + return BlockFlatmmASmemBSmemCRegV1{}; + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/flatmm/pipeline/tile_flatmm_shape.hpp b/include/ck_tile/ops/flatmm/pipeline/tile_flatmm_shape.hpp new file mode 100644 index 0000000000..551d390ec6 --- /dev/null +++ b/include/ck_tile/ops/flatmm/pipeline/tile_flatmm_shape.hpp @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/concat.hpp" + +namespace ck_tile { + +template +struct TileFlatmmShape +{ + using BlockTile = remove_cvref_t; + using BlockWarps = remove_cvref_t; + using WarpTile = remove_cvref_t; + + static constexpr auto idxM = number<0>{}; + static constexpr auto idxN = number<1>{}; + static constexpr auto idxK = number<2>{}; + + static constexpr index_t NumWarps = reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{}); + + static constexpr index_t kM = BlockTile::at(idxM); + static constexpr index_t kN = BlockTile::at(idxN); + static constexpr index_t kK = BlockTile::at(idxK); + + static constexpr index_t flatNPerWarp = BlockWarps::at(idxN); + static constexpr index_t flatKPerWarp = WarpTile::at(idxK) * WarpTile::at(idxN); + static constexpr index_t flatKPerBlock = flatKPerWarp * kK / WarpTile::at(idxK); + + CK_TILE_HOST static std::string GetName() + { + // clang-format off + return concat('_', "tile_flatmm_shape", + concat('x', kM, kN, kK, NumWarps), + concat('x', BlockWarps::at(idxM), BlockWarps::at(idxN), BlockWarps::at(idxK)), + concat('x', (WarpTile::at(idxM)), WarpTile::at(idxN), WarpTile::at(idxK))); + // clang-format on + } +}; + +} // namespace ck_tile From 7c32652e03d9a2015f5ab04c5193723869e2525e Mon Sep 17 00:00:00 2001 
From: aledudek Date: Wed, 16 Apr 2025 11:00:55 +0200 Subject: [PATCH 048/443] Add grouped conv fwd 3d GKCYX instances for f32, f16, bf16 (#2069) * Part1 * Add grouped conv fwd 3d GKCYX instances for f32, f16, bf16 * Add missing coma * Add missing cpp instance files * Fix 3d layout * Add missing closing bracket * Add missing comp x2 and part2 instances * Fix typo in instance name * fix * Fix --------- Co-authored-by: Bartlomiej Kocot --- .../gpu/grouped_convolution_forward.hpp | 64 ++++++++++- .../grouped_convolution_forward_comp_xdl.inc | 105 ++++++++++++++++++ ...uped_convolution_forward_mem_inter_xdl.inc | 49 ++++++++ ...uped_convolution_forward_mem_intra_xdl.inc | 49 ++++++++ .../gpu/grouped_convolution_forward_xdl.inc | 49 ++++++++ ..._convolution_forward_xdl_merged_groups.inc | 49 ++++++++ .../gpu/grouped_conv3d_fwd/CMakeLists.txt | 19 ++++ ...hw_gkczyx_ngkdhw_bf16_comp_2x_instance.cpp | 43 +++++++ ...gcdhw_gkczyx_ngkdhw_bf16_comp_instance.cpp | 54 +++++++++ ...gkczyx_ngkdhw_bf16_comp_part2_instance.cpp | 45 ++++++++ ...dhw_gkczyx_ngkdhw_f16_comp_2x_instance.cpp | 43 +++++++ ...ngcdhw_gkczyx_ngkdhw_f16_comp_instance.cpp | 54 +++++++++ ..._gkczyx_ngkdhw_f16_comp_part2_instance.cpp | 45 ++++++++ ...ngcdhw_gkczyx_ngkdhw_f32_comp_instance.cpp | 54 +++++++++ ...xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp | 53 +++++++++ ..._xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp | 53 +++++++++ ..._xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp | 53 +++++++++ ..._gkczyx_ngkdhw_bf16_mem_inter_instance.cpp | 55 +++++++++ ..._gkczyx_ngkdhw_bf16_mem_intra_instance.cpp | 55 +++++++++ ...w_gkczyx_ngkdhw_f16_mem_inter_instance.cpp | 55 +++++++++ ...w_gkczyx_ngkdhw_f16_mem_intra_instance.cpp | 55 +++++++++ ...w_gkczyx_ngkdhw_f32_mem_inter_instance.cpp | 55 +++++++++ ...w_gkczyx_ngkdhw_f32_mem_intra_instance.cpp | 55 +++++++++ ...ups_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp | 47 ++++++++ ...oups_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp | 47 ++++++++ ...oups_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp | 47 
++++++++ profiler/src/profile_grouped_conv_fwd.cpp | 27 ++++- .../test_grouped_convnd_fwd.cpp | 4 +- 28 files changed, 1377 insertions(+), 6 deletions(-) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_part2_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp index 0b7df6ecfb..638a3f98a3 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp @@ -523,7 +523,69 @@ struct DeviceOperationInstanceFactory && + is_same_v && is_same_v) + { +#ifdef CK_ENABLE_FP32 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + 
add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f32_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instances(op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instances(op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instances( + op_ptrs); + } #endif +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances(op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instances(op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_part2_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && + is_same_v && + is_same_v && + is_same_v && + is_same_v) + { + add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_bf16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances(op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instances(op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instances( + op_ptrs); + } +#endif + } 
+ +#endif // CK_USE_XDL #ifdef CK_USE_WMMA if constexpr(NumDimSpatial == 2 && is_same_v && @@ -639,7 +701,7 @@ struct DeviceOperationInstanceFactory>>& instances); #endif +// grouped conv3d forward, NGCDHW/GKCZYX/NGKDHW +#ifdef CK_ENABLE_FP16 +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instances( + std::vector>>& instances); + +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instances( + std::vector>>& instances); + +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_part2_instances( + std::vector>>& instances); +#endif + +#ifdef CK_ENABLE_BF16 +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instances( + std::vector>>& instances); + +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instances( + std::vector>>& instances); + +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instances( + std::vector>>& instances); +#endif + +#ifdef CK_ENABLE_FP32 +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instances( + std::vector>>& instances); +#endif + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc index 3900c7a0fb..00351ceefd 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc @@ -171,6 +171,55 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instan PassThrough>>>& instances); #endif +// grouped conv3d forward, NGCDHW/GKCZYX/NGKDHW +#ifdef CK_ENABLE_FP16 +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instances( + std::vector>>& instances); 
+#endif + +#ifdef CK_ENABLE_BF16 +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instances( + std::vector>>& instances); +#endif + +#ifdef CK_ENABLE_FP32 +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instances( + std::vector>>& instances); +#endif + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc index b7815f5023..bd44116057 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc @@ -171,6 +171,55 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instan PassThrough>>>& instances); #endif +// grouped conv3d forward, NGCDHW/GKCZYX/NGKDHW +#ifdef CK_ENABLE_FP16 +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instances( + std::vector>>& instances); +#endif + +#ifdef CK_ENABLE_BF16 +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instances( + std::vector>>& instances); +#endif + +#ifdef CK_ENABLE_FP32 +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instances( + std::vector>>& instances); +#endif + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc index b934b9aef6..d3624b0fd9 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc @@ -517,6 +517,55 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf8_f8_instances( F8>>>& instances); #endif +// grouped conv3d forward, NGCDHW/GKCZYX/NGKDHW +#ifdef CK_ENABLE_FP16 +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances( + std::vector>>& instances); +#endif + +#ifdef CK_ENABLE_BF16 +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances( + std::vector>>& instances); +#endif + +#ifdef CK_ENABLE_FP32 +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instances( + std::vector>>& instances); +#endif + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc index 966b883301..9f54c4b633 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc @@ -178,6 +178,55 @@ void add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_in PassThrough>>>& instances); #endif +// grouped conv3d forward, NGCDHW/GKCZYX/NGKDHW +#ifdef CK_ENABLE_BF16 +void add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_bf16_instances( + std::vector>>& instances); +#endif + +#ifdef CK_ENABLE_FP16 +void add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f16_instances( + std::vector>>& instances); +#endif + +#ifdef CK_ENABLE_FP32 +void add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f32_instances( + std::vector>>& instances); +#endif + } // namespace instance } // namespace device } // namespace tensor_operation diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt index 1e572f9ceb..7b9ccf6609 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt @@ -8,6 +8,9 @@ set(GROUPED_CONV3D_FWD xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp + xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp + xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp + xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp @@ -16,18 +19,34 @@ set(GROUPED_CONV3D_FWD xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp + xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp + xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp + xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.cpp xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.cpp xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.cpp + 
xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.cpp + xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instance.cpp + xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instance.cpp xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.cpp xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.cpp xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.cpp + xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.cpp + xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instance.cpp + xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instance.cpp xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.cpp xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.cpp + xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.cpp + xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.cpp + xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instance.cpp + xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instance.cpp + xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instance.cpp + xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instance.cpp + xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_part2_instance.cpp wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instance.cpp new file mode 100644 index 0000000000..3e1a2dd48b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instance.cpp @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instances( + std::vector>>& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.cpp new file mode 100644 index 0000000000..43241454a5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.cpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instance.cpp new file mode 100644 index 0000000000..85a1c9137c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instance.cpp @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instances( + std::vector>>&) +{ + if(ck::get_device_name() != "gfx950") + { +#if 0 // TODO: Improve compilation time and enable these instances + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault>{}); +#endif + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instance.cpp new file mode 100644 index 0000000000..9b8bf4fa42 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instance.cpp @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instances( + std::vector>>& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.cpp new file mode 100644 index 0000000000..d02d9f6778 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.cpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_comp_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_comp_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_comp_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_part2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_part2_instance.cpp new file mode 100644 index 0000000000..eaac75ee9e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_part2_instance.cpp @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_part2_instances( + std::vector>>&) +{ + if(ck::get_device_name() != "gfx950") + { +#if 0 // TODO: Improve compilation time and enable these instances + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault>{}); +#endif + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instance.cpp new file mode 100644 index 0000000000..696ea7f34e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instance.cpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f32_comp_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_comp_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f32_comp_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp new file mode 100644 index 0000000000..060eebebc1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1P0>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp new file mode 100644 index 0000000000..85b088f416 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1P0>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp new file mode 100644 index 0000000000..2b3e596355 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1P0>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.cpp new file mode 100644 index 0000000000..fac3098341 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault, + Interwave>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1P0, + Interwave>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1S1P0, + Interwave>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.cpp new file mode 100644 index 0000000000..f3eccc7dc8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault, + Intrawave>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1P0, + Intrawave>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1S1P0, + Intrawave>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instance.cpp new file mode 100644 index 0000000000..abea0bea81 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instance.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault, + Interwave>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1P0, + Interwave>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1S1P0, + Interwave>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instance.cpp new file mode 100644 index 0000000000..ba5d9fb1de --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instance.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault, + Intrawave>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1P0, + Intrawave>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1S1P0, + Intrawave>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instance.cpp new file mode 100644 index 0000000000..5a2c4a0d5b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instance.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault, + Interwave>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1P0, + Interwave>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1S1P0, + Interwave>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instance.cpp new file mode 100644 index 0000000000..701b8eb4a4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instance.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault, + Intrawave>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1P0, + Intrawave>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1S1P0, + Intrawave>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp new file mode 100644 index 0000000000..71bde2faa5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd3x3>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp new file mode 100644 index 0000000000..2e71b76256 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd3x3>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp new file mode 100644 index 0000000000..8a53dea612 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/merged_groups/device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_merged_groups_ngcdhw_gkczyx_ngkdhw_f32_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd3x3>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/src/profile_grouped_conv_fwd.cpp b/profiler/src/profile_grouped_conv_fwd.cpp index 9ee05d1304..a7714b4c73 100644 --- a/profiler/src/profile_grouped_conv_fwd.cpp +++ b/profiler/src/profile_grouped_conv_fwd.cpp @@ -114,17 +114,19 @@ int profile_grouped_conv_fwd(int argc, char* argv[]) using GKZYXC = ck::tensor_layout::convolution::GKZYXC; // using GKCX = ck::tensor_layout::convolution::GKXC; - using GKCYX = ck::tensor_layout::convolution::GKCYX; - // using GKCZYX = ck::tensor_layout::convolution::GKZYXC; + using GKCYX = ck::tensor_layout::convolution::GKCYX; + using GKCZYX = ck::tensor_layout::convolution::GKCZYX; using GNWK = ck::tensor_layout::convolution::GNWK; using GNHWK = ck::tensor_layout::convolution::GNHWK; using GNDHWK = ck::tensor_layout::convolution::GNDHWK; // - using NGCHW = ck::tensor_layout::convolution::NGCHW; + using NGCHW = ck::tensor_layout::convolution::NGCHW; + using NGCDHW = ck::tensor_layout::convolution::NGCDHW; - using NGKHW = ck::tensor_layout::convolution::NGKHW; + using NGKHW = ck::tensor_layout::convolution::NGKHW; + using NGKDHW = 
ck::tensor_layout::convolution::NGKDHW; // using NWGC = ck::tensor_layout::convolution::NWGC; @@ -366,6 +368,23 @@ int profile_grouped_conv_fwd(int argc, char* argv[]) return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF8{}, F8{}, F8{}, BF8{}, F8{}); } } + // NGCDHW_GKCZYX_NGKDHW + else if(num_dim_spatial == 3 && layout == ConvLayout::NGCHW_GKCYX_NGKHW) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile( + I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + } std::cout << "this data_type & layout is not implemented" << std::endl; diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp index 43b77641d1..1cf91df52c 100644 --- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp +++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp @@ -77,7 +77,9 @@ using KernelTypes3d = ::testing::Types std::tuple, std::tuple, std::tuple, - std::tuple>; + std::tuple, + std::tuple, + std::tuple>; template class TestGroupedConvndFwd1d : public TestGroupedConvndFwd From 3bb62f16cd023095dac9467351253861b9d92555 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 16 Apr 2025 12:10:15 -0700 Subject: [PATCH 049/443] Upgrade default docker to Ubuntu24.04 (#2090) * upgrade docker to Ubuntu24.04 * add break-system-packages flag to pip install * fix dockerfile --- Dockerfile | 14 +++++--------- Dockerfile.compiler | 2 +- Jenkinsfile | 8 ++++---- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2a8fb707c9..f77c685000 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:22.04 +FROM 
ubuntu:24.04 ARG DEBIAN_FRONTEND=noninteractive ARG ROCMVERSION=6.4 ARG compiler_version="" @@ -14,8 +14,8 @@ RUN set -xe && \ curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg RUN if [ "$ROCMVERSION" != "6.5" ]; then \ - sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/jammy/amdgpu-install_6.3.60300-1_all.deb --no-check-certificate" && \ - apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.3.60300-1_all.deb && \ + sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/jammy/amdgpu-install_6.4.60400-1_all.deb --no-check-certificate" && \ + apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.4.60400-1_all.deb && \ wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO jammy main > /etc/apt/sources.list.d/rocm.list" && \ sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu jammy main > /etc/apt/sources.list.d/amdgpu.list'; \ @@ -44,7 +44,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- iputils-ping \ jq \ libelf-dev \ - libncurses5-dev \ libnuma-dev \ libpthread-stubs0-dev \ llvm-amdgpu \ @@ -73,10 +72,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- # Remove unnecessary rocm components that take a lot of space apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt -# Update the cmake to version 3.27.5 -RUN pip install --upgrade cmake==3.27.5 && \ #Install latest ccache - git clone https://github.com/ccache/ccache.git && \ +RUN git clone https://github.com/ccache/ccache.git && \ cd ccache && mkdir build && cd build && cmake .. 
&& make install && \ #Install ninja build tracing tools cd / && \ @@ -97,8 +94,7 @@ RUN pip install --upgrade cmake==3.27.5 && \ wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb && \ dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \ # Install packages for processing the performance results - pip3 install --upgrade pip && \ - pip3 install --upgrade pytest sqlalchemy==2.0.36 pymysql pandas==2.2.3 setuptools-rust setuptools sshtunnel==0.4.0 && \ + pip3 install --break-system-packages --upgrade pytest pymysql pandas==2.2.3 sqlalchemy==2.0.3 setuptools-rust setuptools sshtunnel==0.4.0 && \ # Add render group groupadd -f render && \ # Install the new rocm-cmake version diff --git a/Dockerfile.compiler b/Dockerfile.compiler index f4aa12f356..7534910681 100644 --- a/Dockerfile.compiler +++ b/Dockerfile.compiler @@ -1,4 +1,4 @@ -ARG BASE_DOCKER="rocm/composable_kernel:ck_ub22.04_rocm6.4" +ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm6.4" FROM $BASE_DOCKER ARG compiler_version="" ARG compiler_commit="" diff --git a/Jenkinsfile b/Jenkinsfile index e6256fc3d8..3d7019bd1f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -40,10 +40,10 @@ def getBaseDockerImageName(){ else{ def ROCM_numeric = "${params.ROCMVERSION}" as float if ( ROCM_numeric < 6.5 ){ - img = "${env.CK_DOCKERHUB}:ck_ub22.04_rocm${params.ROCMVERSION}" + img = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm${params.ROCMVERSION}" } else{ - img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub22.04_rocm${params.ROCMVERSION}" + img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm${params.ROCMVERSION}" } } return img @@ -535,7 +535,7 @@ def Build_CK(Map conf=[:]){ if ( !params.BUILD_LEGACY_OS && arch_type == 1 ){ echo "Run inductor codegen tests" sh """ - pip install --verbose . + pip install --break-system-packages --verbose . 
pytest python/test/test_gen_instances.py """ } @@ -745,7 +745,7 @@ def process_results(Map conf=[:]){ //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;ROCMVERSION=6.4;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true - 0 21 * * * % ROCMVERSION=6.4;hipTensor_test=true;RUN_CODEGEN_TESTS=true;BUILD_GFX908=true; + 0 21 * * * % ROCMVERSION=6.4;hipTensor_test=true;RUN_CODEGEN_TESTS=true;BUILD_GFX908=true 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false From da54464cce95c2f0334676ce24b863eed202d873 Mon Sep 17 00:00:00 2001 From: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com> Date: Wed, 16 Apr 2025 15:25:02 -0600 Subject: [PATCH 050/443] MX GEMM - Add MX BF8 example (#2071) * Add MX GEMM example for MX BF8 * Verified MX FP8 with 16x16x128 scale builtin * Verify MX BF8 GEMM with BF16 output --- example/67_gemm_microscaling/CMakeLists.txt | 3 + example/67_gemm_microscaling/gemm_mx_bf8.cpp | 98 +++++++++++++++++++ .../impl/device_gemm_xdl_cshuffle_v3_mx.hpp | 3 + .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 6 ++ include/ck/utility/amd_xdlops.hpp | 29 ++++++ 5 files changed, 139 insertions(+) create mode 100644 example/67_gemm_microscaling/gemm_mx_bf8.cpp diff --git a/example/67_gemm_microscaling/CMakeLists.txt b/example/67_gemm_microscaling/CMakeLists.txt index 93770684df..34125465a9 100644 --- a/example/67_gemm_microscaling/CMakeLists.txt +++ b/example/67_gemm_microscaling/CMakeLists.txt @@ -3,3 +3,6 @@ add_custom_target(example_gemm_mx) 
add_example_executable(example_gemm_mx_fp8 gemm_mx_fp8.cpp) add_example_dependencies(example_gemm_mx example_gemm_mx_fp8) +add_example_executable(example_gemm_mx_bf8 gemm_mx_bf8.cpp) +add_example_dependencies(example_gemm_mx example_gemm_mx_bf8) + diff --git a/example/67_gemm_microscaling/gemm_mx_bf8.cpp b/example/67_gemm_microscaling/gemm_mx_bf8.cpp new file mode 100644 index 0000000000..8e341fb591 --- /dev/null +++ b/example/67_gemm_microscaling/gemm_mx_bf8.cpp @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "gemm_mx_common.hpp" + +using ADataType = ck::bf8_t; +using BDataType = ck::bf8_t; + +using XDataType = ck::e8m0_bexp_t; + +using CDataType = ck::bhalf_t; +using AccDataType = float; +using CShuffleDataType = CDataType; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; // elementwise transformation for A matrix +using BElementOp = PassThrough; // elementwise transformation for B matrix +using CElementOp = PassThrough; // elementwise transformation for C matrix + +constexpr ck::index_t ScaleBlockSize = 32; // scaling block size +constexpr ck::index_t KPerBlock = 128; + +constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; +constexpr auto BlkGemmPSched = ck::BlockGemmPipelineScheduler::Intrawave; +constexpr auto BlkGemmPVer = ck::BlockGemmPipelineVersion::v1; + +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMX_Xdl_CShuffleV3< + ALayout, // ALayout + BLayout, // BLayout + CLayout, // CLayout + ADataType, // ADataType + XDataType, // AScaleDataType + BDataType, // BDataType + XDataType, // BScaleDataType + CDataType, // CDataType + AccDataType, // GemmAccDataType + CShuffleDataType, // CShuffleDataType + AElementOp, // AElementwiseOperation + BElementOp, // BElementwiseOperation + CElementOp, // CElementwiseOperation + GemmSpec, // GemmSpec + ScaleBlockSize, // 
ScaleBlockSize: Scaling block size + 128, // BlockSize: Thread block size + 128, // MPerBlock + 16, // NPerBlock + KPerBlock, // KPerBlock + 16, // AK1 + 16, // BK1 + 16, // MPerXDL + 16, // NPerXDL + 4, // MXdlPerWave + 1, // NXdlPerWave + S<8, 16, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 16, // ABlockTransferSrcScalarPerVector + 16, // ABlockTransferDstScalarPerVector_AK1 + false, // ABlockLdsExtraM + S<8, 16, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 16, // BBlockTransferSrcScalarPerVector + 16, // BBlockTransferDstScalarPerVector_BK1 + false, // BBlockLdsExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<1, 16, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 2, // CShuffleBlockTransferScalarPerVector_NPerBlock + BlkGemmPSched, // BlkGemmPipeSched + BlkGemmPVer, // BlkGemmPipelineVer + ADataType, // ComputeTypeA + BDataType // ComputeTypeB + >; + +int main(int argc, char* argv[]) +{ + return run_mx_gemm_example(argc, argv) + ? 
0 + : -1; +} diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp index 8a370304c6..62bc2c4499 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp @@ -699,6 +699,9 @@ struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX && is_same_v, + "ComputeTypeA and ComputeTypeB must be the same as ADataType and BDataType"); + return true; } diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp index 529a1a1729..08c4e4ba6e 100644 --- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp @@ -1141,6 +1141,12 @@ struct MfmaSelector return MfmaInstr::mfma_scale_f32_16x16x128f8f6f4; } + template <> + constexpr auto GetMfma() + { + return MfmaInstr::mfma_scale_f32_16x16x128f8f6f4; + } + template <> constexpr auto GetMfma() { diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index a54a181bf1..a8c3baa31b 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -588,6 +588,35 @@ struct intrin_mfma_scale_f32_16x16x128f8f6f4<16, 16> ignore = reg_b; ignore = scale_b; ignore = reg_c; +#endif + } + + template + __device__ static void Run(const bf8x32_t& reg_a, + const int32_t& scale_a, + const bf8x32_t& reg_b, + const int32_t& scale_b, + FloatC& reg_c) + { +#if defined(__gfx950__) + // https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/llvm/test/Verifier/AMDGPU/mfma-scale.ll#L10 + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + reg_a, + reg_b, + reg_c.template AsType()[Number<0>{}], + 1, // cbsz + 1, // blgp + 0, // OPSEL + scale_a, + 0, // OPSEL + scale_b); +#else + ignore = reg_a; 
+ ignore = scale_a; + ignore = reg_b; + ignore = scale_b; + ignore = reg_c; #endif } }; From 213b203a3c4409cc0906cf13ecbc3a09092f67b2 Mon Sep 17 00:00:00 2001 From: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com> Date: Wed, 16 Apr 2025 19:56:00 -0600 Subject: [PATCH 051/443] MX GEMM - Parameterized Test Template (#2088) * Tests for MX FP8 GEMM * Improve documentation --- .../impl/device_gemm_xdl_cshuffle_v3_mx.hpp | 16 +- .../tensor_operation_instance/gpu/gemm_mx.hpp | 111 ++++ .../gpu/CMakeLists.txt | 13 + .../gpu/gemm_mx/CMakeLists.txt | 14 + ...device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp | 63 +++ ...l_f8_f8_bf16_mk_nk_mn_default_instance.cpp | 32 ++ .../device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp | 63 +++ ...dl_f8_f8_f16_mk_nk_mn_default_instance.cpp | 32 ++ test/CMakeLists.txt | 1 + test/gemm_mx/CMakeLists.txt | 4 + test/gemm_mx/test_gemm_mx.cpp | 108 ++++ test/gemm_mx/test_gemm_mx_util.hpp | 498 ++++++++++++++++++ 12 files changed, 948 insertions(+), 7 deletions(-) create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/gemm_mx.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_mx/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instance.cpp create mode 100644 test/gemm_mx/CMakeLists.txt create mode 100644 test/gemm_mx/test_gemm_mx.cpp create mode 100644 test/gemm_mx/test_gemm_mx_util.hpp diff --git 
a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp index 62bc2c4499..c37af49387 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp @@ -22,6 +22,7 @@ namespace ck { namespace tensor_operation { namespace device { +// clang-format off /** * \brief WIP: Implements XDL CShuffle V3 GEMM for microscale-compliant data types * @@ -31,8 +32,8 @@ namespace device { * Assumptions: * - A and B data types are compliant with the OCP Microscaling Formats (MX) Specification * - Each scale applies to ScaleBlockSize elements in K direction - * - A scale matrix is row-major - * - B scale matrix is column-major + * - A scale matrix is a row-major + * - B scale matrix is a column-major * - Scale data types must have get_exponent_value() specialization, whereas lowest 8 bits of the * exponent will be interpreted as conventional biased Float32 exponent (E8M0) * @@ -72,10 +73,10 @@ namespace device { * for(int mw = m0; mw < m0 + MWaves * MPerXDL; mw += MPerXDL){ * for(int nw = n0; nw < n0 + NWaves * NPerXDL; nw += NPerXDL){ * for(int k0 = kb; k0 < kb + KPerBlock; k0 += mfma.num_input_blks*KPack){ - * // MFMA accumulation for multirate instructions - * for(int k_pack = k0; k_pack < k0 + mfma.num_input_blks*KPack; k_pack += KPack){ - * for(int k_mfma = k_pack; k_mfma < k_pack + KPack; k_mfma += mfma.k_per_blk){ - * // MFMA instruction + * // MFMA accumulation + * for(int k_pack = k0; k_pack < k0 + mfma.num_input_blks*KPack; k_pack += KPerXdlops){ + * // MFMA instruction + * for(int k_mfma = k_pack; k_mfma < k_pack + KPerXdlops; k_mfma += mfma.k_per_blk){ * for(int m = mw; m < mw + MPerXDL; m++){ * for(int n = nw; n < nw + NPerXDL; n++){ * for(int k = k_mfma; k < k_mfma + mfma.k_per_blk; k++){ @@ -96,6 +97,7 @@ namespace device { * \endcode * */ +// 
clang-format on template +#include +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instances( + std::vector>>& instances); + +void add_device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instances( + std::vector>>& instances); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceGemmMX> +{ + using DeviceOp = DeviceGemmMX; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + + add_device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instances(op_ptrs); + } + if constexpr(is_same_v && is_same_v && + is_same_v) + { + + add_device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index 2542dd236b..70e54962ed 100755 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -60,6 +60,13 @@ function(add_instance_library INSTANCE_NAME) list(REMOVE_ITEM ARGN "${source}") endif() endforeach() + # Do not build MX instances if gfx950 targets are not on the target list + foreach(source IN LISTS ARGN) + if(NOT INST_TARGETS MATCHES "gfx950" AND source MATCHES "_mx") + message("removing MX instance ${source} ") + list(REMOVE_ITEM ARGN "${source}") + endif() + endforeach() # 
Do not build WMMA instances if gfx11 targets are not on the target list foreach(source IN LISTS ARGN) if(NOT INST_TARGETS MATCHES "gfx11" AND NOT INST_TARGETS MATCHES "gfx12" AND source MATCHES "_wmma") @@ -100,6 +107,8 @@ function(add_instance_library INSTANCE_NAME) list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx950) elseif(source MATCHES "mha") list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) + elseif(source MATCHES "_mx") + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) endif() #only build the fp8 gemm instances for gfx90a if the build argument is set, otherwise only build for gfx942/gfx950 if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH) @@ -234,6 +243,10 @@ FOREACH(subdir_path ${dir_list}) if(("${cmake_instance}" MATCHES "ONLY XDL_KERNELS") AND (NOT INST_TARGETS MATCHES "gfx9")) message("Found only xdl instances, but gfx9 is not on the targets list. Skipping.") set(add_inst 0) + endif() + if(("${cmake_instance}" MATCHES "ONLY MX_KERNELS") AND (NOT INST_TARGETS MATCHES "gfx950")) + message("Found only MX instances, but gfx950 is not on the targets list. Skipping.") + set(add_inst 0) endif() if(("${cmake_instance}" MATCHES "ONLY WMMA_KERNELS") AND (NOT INST_TARGETS MATCHES "gfx11") AND (NOT INST_TARGETS MATCHES "gfx12")) message("Found only wmma instances, but gfx11 is not on the targets list. 
Skipping.") diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_mx/CMakeLists.txt new file mode 100644 index 0000000000..a166fc4ce4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_mx/CMakeLists.txt @@ -0,0 +1,14 @@ +# ONLY MX_KERNELS +set(GEMM_MX_INSTANCES) + +list(APPEND GEMM_MX_INSTANCES + device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instance.cpp + device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instance.cpp + ) + + +set_source_files_properties(device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") + + +add_instance_library(device_gemm_mx_instance ${GEMM_MX_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp new file mode 100644 index 0000000000..1e979f69ca --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F8 = f8_t; +using F16 = half_t; +using BF16 = bhalf_t; +using F32 = float; +using E8M0 = ck::e8m0_bexp_t; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +static constexpr auto ScaleBlockSize = 32; + +template +using device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_instances = std::tuple< +// clang-format off + //#########################| ALayout| BLayout| CLayout|AData|AScale|BData|BScale| CData| AccData| Cshuffle| A| B| C| GEMM| Scale Block| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Data| Type| Data| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| 
ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | Type| | Type| | | | Operation| Operation| Operation| | | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +#if defined(__gfx950__) || defined(CK_USE_NATIVE_MX_SUPPORT) + DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 128, 128, 16, 128, 16, 16, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 256, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, false, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, 
PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 64, 16, 16, 512, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1> + +//Require verification + //DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 256, 256, 128, 16, 16, 16, 16, 8, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1> +#endif + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instance.cpp new file mode 100644 index 0000000000..05914e06b5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instance.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp new file mode 100644 index 0000000000..0ca4f2a3ce --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F8 = f8_t; +using F16 = half_t; +using BF16 = bhalf_t; +using F32 = float; +using E8M0 = ck::e8m0_bexp_t; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +static constexpr auto ScaleBlockSize = 32; + +template +using device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_instances = std::tuple< +// clang-format off + //#########################| ALayout| BLayout| CLayout|AData|AScale|BData|BScale| CData| AccData| Cshuffle| A| B| C| GEMM| Scale Block| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Data| Type| Data| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| 
ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | Type| | Type| | | | Operation| Operation| Operation| | | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +#if defined(__gfx950__) || defined(CK_USE_NATIVE_MX_SUPPORT) + DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 128, 128, 16, 128, 16, 16, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 256, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, false, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, 
PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 64, 16, 16, 512, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1> + + //Require verification + //DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 256, 256, 128, 16, 16, 16, 16, 8, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, +#endif + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instance.cpp new file mode 100644 index 0000000000..f4e59cf92d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instance.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 18611d8052..72c51823be 100755 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -279,6 +279,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx942" OR SUPPORTED_GPU_TARGETS MATCHES "gfx9 endif() if(SUPPORTED_GPU_TARGETS MATCHES "gfx950") add_subdirectory(mx_mfma_op) + add_subdirectory(gemm_mx) endif() add_subdirectory(position_embedding) add_subdirectory(scatter_gather) diff --git a/test/gemm_mx/CMakeLists.txt b/test/gemm_mx/CMakeLists.txt new file mode 100644 index 0000000000..71a0a98f2d --- /dev/null +++ b/test/gemm_mx/CMakeLists.txt @@ -0,0 +1,4 @@ +add_gtest_executable(test_gemm_mx test_gemm_mx.cpp) +if(result EQUAL 0) + target_link_libraries(test_gemm_mx PRIVATE utility device_gemm_mx_instance) + endif() diff --git a/test/gemm_mx/test_gemm_mx.cpp b/test/gemm_mx/test_gemm_mx.cpp new file mode 100644 index 0000000000..6e1957e60a --- /dev/null +++ b/test/gemm_mx/test_gemm_mx.cpp @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "gtest/gtest.h" + +#include "test_gemm_mx_util.hpp" + +using E8M0 = ck::e8m0_bexp_t; +using F8 = ck::f8_t; +using BF8 = ck::bf8_t; +using F6 = ck::f6_t; +using BF6 = ck::bf6_t; +using F4 = ck::f4_t; +using F16 = ck::half_t; +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +namespace { + +template +struct tuple_concat; + +template +struct tuple_concat, std::tuple> +{ + using type = std::tuple; +}; + +} // namespace + +template +class TestGemmMX_MK_NK + : public ck::test::TestGemmMX, Tuple>::type> +{ +}; + +// clang-format off +using KernelTypes_MK_NK = ::testing::Types< +#if defined(CK_ENABLE_FP8) + // ADataType, BDataType, CDataType, ScaleBlockSize + std::tuple< F8, F8, F16, ck::Number<32> >, + std::tuple< F8, F8, BF16, ck::Number<32> > +#endif + >; +// clang-format on + +TYPED_TEST_SUITE(TestGemmMX_MK_NK, KernelTypes_MK_NK); + +TYPED_TEST(TestGemmMX_MK_NK, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 256; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmMX_MK_NK, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 256; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmMX_MK_NK, Regular) +{ + std::vector Ms{3840}; + constexpr int N = 512; + constexpr int K = 1024; + + constexpr int StrideA = K; + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmMX_MK_NK, Large) +{ + std::vector Ms{4096}; + constexpr int N = 3840; + constexpr int K = 4096; + + constexpr int StrideA = K; + constexpr int 
StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} diff --git a/test/gemm_mx/test_gemm_mx_util.hpp b/test/gemm_mx/test_gemm_mx_util.hpp new file mode 100644 index 0000000000..3bca4ceded --- /dev/null +++ b/test/gemm_mx/test_gemm_mx_util.hpp @@ -0,0 +1,498 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/utility/data_type.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/utility/number.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/fill.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_mx.hpp" +#include "ck/library/tensor_operation_instance/gpu/gemm_mx.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_mx_gemm.hpp" +#include "ck/library/utility/check_err.hpp" + +namespace ck { +namespace test { + +namespace { +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; +} // namespace + +template +bool profile_gemm_mx_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + int M, + int N, + int K, + int StrideA, + int StrideB, + int StrideC, + int KBatch, + int n_warmup, + int n_iter, + uint64_t rotating = 0) +{ + if(K % ScaleBlockSize != 0) + { + throw std::runtime_error("wrong! 
K must be multiple of ScaleBlockSize."); + }; + + using ScaleDataType = e8m0_bexp_t; + using AScaleLayout = Row; + using BScaleLayout = Col; + + bool pass = true; + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + auto f_get_default_stride = + [](ck::index_t row, ck::index_t col, ck::index_t stride, auto layout) { + if(stride == -1) + { + // give a chance if stride is -1, return a default packed stride + if constexpr(std::is_same_v) + { + return static_cast(col); + } + else + { + return static_cast(row); + } + } + else + return static_cast(stride); + }; + + auto Scale_Stride_AM = f_get_default_stride(M, K / ScaleBlockSize, -1, AScaleLayout{}); + auto Scale_Stride_BN = f_get_default_stride(K / ScaleBlockSize, N, -1, BScaleLayout{}); + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + + Tensor a_m_k_scale(f_host_tensor_descriptor( + M, K / ScaleBlockSize, Scale_Stride_AM, AScaleLayout{})); // scales for A + Tensor b_k_n_scale(f_host_tensor_descriptor( + K / ScaleBlockSize, N, Scale_Stride_BN, BScaleLayout{})); // scales for B + + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + std::size_t total_gemm_needed = + a_m_k.GetElementSpaceSizeInBytes() + b_k_n.GetElementSpaceSizeInBytes() + + a_m_k_scale.GetElementSpaceSizeInBytes() + b_k_n_scale.GetElementSpaceSizeInBytes(); + int rotating_count = std::max( + 1, + std::min(n_iter, + static_cast(std::ceil(static_cast(rotating) / total_gemm_needed)))); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "a_m_k_scale: " << a_m_k_scale.mDesc << 
std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "b_k_n_scale: " << b_k_n_scale.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl; + std::cout << "rotating count: " << rotating_count << std::endl; + + switch(init_method) + { + case 0: // Initializations for development and debugging + ck::utils::FillConstant{ck::type_convert(1.0f)}(a_m_k); + ck::utils::FillConstant{ck::type_convert(2.0f)}(a_m_k_scale); + ck::utils::FillConstant{ck::type_convert(0.5f)}(b_k_n); + ck::utils::FillConstant{ck::type_convert(1.0f)}(b_k_n_scale); + if(do_log) + { + std::cout << "Init A = {1}" << std::endl; + std::cout << "Init A scale = {2.0}" << std::endl; + std::cout << "Init B = {0.5}" << std::endl; + std::cout << "Init B scale = {1.0}" << std::endl; + std::cout << "Expect C = {K}" << std::endl; + } + break; + + case 1: + + a_m_k.GenerateTensorValue(GeneratorTensor_2{-4, 5}); // Z[-4,4] + b_k_n.GenerateTensorValue(GeneratorTensor_2{-4, 5}); // Z[-4,4] + + a_m_k_scale.GenerateTensorValue( + GeneratorTensor_2{125, 129}); // scales: {0.25, 0.5, 1, 2} + b_k_n_scale.GenerateTensorValue( + GeneratorTensor_2{125, 129}); // scales: {0.25, 0.5, 1, 2} + + break; + + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{-2.0, 2.0}); + a_m_k_scale.GenerateTensorValue( + GeneratorTensor_3{powf(2.0f, -125.0f), 1.0f}); // R[2^-125, 1] + + b_k_n.GenerateTensorValue(GeneratorTensor_3{-2.0, 2.0}); + b_k_n_scale.GenerateTensorValue( + GeneratorTensor_3{powf(2.0f, -125.0f), 1.0f}); + break; + } + + using AElementOp = ck::tensor_operation::element_wise::PassThrough; + using BElementOp = ck::tensor_operation::element_wise::PassThrough; + using CElementOp = ck::tensor_operation::element_wise::PassThrough; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto c_element_op = CElementOp{}; + + if(do_log > 0) + std::cout << "Device memory allocation..." 
<< std::endl; + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem a_scale_device_buf(sizeof(ScaleDataType) * a_m_k_scale.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem b_scale_device_buf(sizeof(ScaleDataType) * b_k_n_scale.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + + if(do_log > 0) + std::cout << "Upload data to device..." << std::endl; + a_device_buf.ToDevice(a_m_k.mData.data()); + a_scale_device_buf.ToDevice(a_m_k_scale.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + b_scale_device_buf.ToDevice(b_k_n_scale.mData.data()); + + if(do_log > 0) + std::cout << "Done." << std::endl; + + using DeviceOp = ck::tensor_operation::device::DeviceGemmMX; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + // Run reference GEMM + if(do_verification) + { + using ReferenceGemmInstance = + ck::tensor_operation::host::ReferenceMXGemm; + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument(a_m_k, + a_m_k_scale, + b_k_n, + b_k_n_scale, + c_m_n_host_result, + a_element_op, + b_element_op, + c_element_op); + + ref_invoker.Run(ref_argument); + } + + std::string best_op_name; + std::optional best_op_object_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + float best_kbatch = 0; + + // profile device GEMM instances + for(auto& op_ptr : op_ptrs) + { + std::vector kbatch_list = {1, 2, 4, 8, 16, 19, 32, 38}; // use these when KBatch <= 0 + + if(KBatch > 0) + { + kbatch_list = {KBatch}; + } + + for(std::size_t i = 0; i < kbatch_list.size(); i++) + { + auto kbatch_curr = kbatch_list[i]; 
+ + auto argument_ptr = op_ptr->MakeArgumentPointer( + static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(a_scale_device_buf.GetDeviceBuffer()), + static_cast(b_device_buf.GetDeviceBuffer()), + static_cast(b_scale_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + Scale_Stride_AM, + StrideB, + Scale_Stride_BN, + StrideC, + kbatch_curr, + a_element_op, + b_element_op, + c_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + + // re-init C to zero before profiling next kernel + c_device_buf.SetZero(); + + invoker_ptr->Run(argument_ptr.get(), + StreamConfig{nullptr, false, 0, n_warmup, n_iter}); + + if(do_verification) + { + c_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + if(do_log) + { + + if(init_method == 0) + { + auto expected = static_cast(K); + auto computed = type_convert(c_m_n_device_result(0, 12)); + + pass = pass & (std::abs(expected - computed) <= 0.0f); + std::cout << "\nExpected vs Computed: " << expected << " vs " + << computed << ((pass) ? 
" (PASSED!)" : " (FAILED!)") + << std::endl + << std::endl; + } + else + { + LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "a_scale : ", a_m_k_scale.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "b_scale: ", b_k_n_scale.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "c_host : ", c_m_n_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "c_device: ", c_m_n_device_result.mData, ",") + << std::endl; + } + } + + pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); + } + + std::string op_name = op_ptr->GetTypeString(); + std::optional op_obj_name = op_ptr->GetObjectName(); + + float ave_time = invoker_ptr->Run(argument_ptr.get(), + StreamConfig{nullptr, + time_kernel, + 0, + n_warmup, + n_iter, + rotating_count > 1, + rotating_count}); + + // Output size(M*N) * [dot product(2K) + product of scales(K/ScaleBlockSize) + + // scaling of partial sums(K/ScaleBlockSize)] + // FLOPS = 2 * M * N * K + 2 * M * N * K / ScaleBlockSize + std::size_t flop = + std::size_t(2) * M * N * K + std::size_t(2) * M * N * K / ScaleBlockSize; + + std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(CDataType) * M * N + + sizeof(ScaleDataType) * (M * K + K * N) / ScaleBlockSize; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops + << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", KBatch " + << kbatch_curr << std::endl; + + if(tflops > best_tflops && ave_time > 1e-10) + { + best_op_name = op_name; + best_op_object_name = op_obj_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + best_kbatch = kbatch_curr; + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this 
problem" + << std::endl; + } + } + } + + if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = f32"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = f16"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = bf16"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = int8"; + } + + if constexpr(is_same::value) + { + std::cout << " ALayout = RowMajor"; + } + else if constexpr(is_same::value) + { + std::cout << " ALayout = ColumnMajor"; + } + + if constexpr(is_same::value) + { + std::cout << " BLayout = RowMajor"; + } + else if constexpr(is_same::value) + { + std::cout << " BLayout = ColumnMajor"; + } + + std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA + << " StrideB = " << StrideB << " StrideC = " << StrideC << " KBatch = " << best_kbatch + << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec + << " GB/s, " << best_op_name << std::endl; + + if(best_op_object_name) + std::cout << best_op_object_name.value() << std::endl; + + return pass; +} + +template +class TestGemmMX : public testing::Test +{ + using Row = ck::tensor_layout::gemm::RowMajor; + using F32 = float; + using ScaleType = e8m0_bexp_t; + + protected: + using ALayout = std::tuple_element_t<0, Tuple>; + using BLayout = std::tuple_element_t<1, Tuple>; + using CLayout = Row; + using ADataType = std::tuple_element_t<2, Tuple>; + using BDataType = std::tuple_element_t<3, Tuple>; + using CDataType = std::tuple_element_t<4, Tuple>; + using AccDataType = float; + + public: + static constexpr index_t ScaleBlockSize = std::tuple_element_t<5, Tuple>{}; + static constexpr bool verify_ = true; + static constexpr int init_method_ = 2; // decimal value initialization + static constexpr bool log_ = false; + static constexpr bool bench_ = false; // measure kernel performance + std::vector k_batches_; + + void SetUp() override { 
k_batches_ = {1}; } + + void Run(const int M, + const int N, + const int K, + const int StrideA, + const int StrideB, + const int StrideC) + { + for(auto kb : k_batches_) + { + RunSingle(M, N, K, StrideA, StrideB, StrideC, kb); + } + } + + void RunSingle(const int M, + const int N, + const int K, + const int StrideA, + const int StrideB, + const int StrideC, + int kbatch = 1, + int n_warmup = 1, + int n_iter = 10) + { + bool pass = ck::test::profile_gemm_mx_impl(verify_, + init_method_, + log_, + bench_, + M, + N, + K, + StrideA, + StrideB, + StrideC, + kbatch, + n_warmup, + n_iter); + EXPECT_TRUE(pass); + } +}; + +} // namespace test +} // namespace ck From bcf5bb41be976d948b504f3d66c29e5baa82618a Mon Sep 17 00:00:00 2001 From: lalala-sh Date: Fri, 18 Apr 2025 10:45:49 +0800 Subject: [PATCH 052/443] enable do top k weights in moe stage1 gemm (#2094) * add switch for mul topk weights * fix bf16/f16 bugs * complete --- .../moe_gemm1_xdl_fp8.cpp | 64 +++++++++++-- .../moe_gemm1_xdl_pk_i4.cpp | 63 +++++++++++-- .../moe_gemm2_xdl_fp8.cpp | 8 +- .../moe_gemm2_xdl_pk_i4.cpp | 8 +- .../gpu/device/impl/device_moe_gemm.hpp | 8 +- .../gpu/grid/gridwise_moe_gemm.hpp | 93 +++++++++++-------- .../cpu/reference_moe_gemm.hpp | 15 ++- .../cpu/reference_moe_gemm2.hpp | 12 ++- 8 files changed, 203 insertions(+), 68 deletions(-) diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp index 66825edcf9..f594080755 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include #include @@ -39,14 +39,16 @@ using AccDataType = F32; using CShuffleDataType = F32; using D0DataType = F32; using D1DataType = F32; -using DsDataType = ck::Tuple; +using D2DataType = F32; +using DsDataType = ck::Tuple; using A0Layout = Row; using B0Layout = Col; using ELayout = Row; using D0Layout = Row; using D1Layout = Col; -using DsLayout = ck::Tuple; +using D2Layout = ELayout; +using DsLayout = ck::Tuple; // for gate, a_scale, b_scale struct MulABScale @@ -83,9 +85,36 @@ struct MulABScaleSilu } }; +struct MulABScaleExpertWeight +{ + template + __host__ __device__ constexpr void + operator()(E& e, const C& c, const D0& d0, const D1& d1, const D2& d2) const; + // for real kernel use + template <> + __host__ __device__ constexpr void operator()( + EDataType& e, const float& c, const float& d0, const float& d1, const float& d2) const + { + // for real kernel use + // warning: hack hack hack here!!!! ignore d0 right now as kernel mul d0 * d2 outside. + // tofix:felix + (void)d2; + e = ck::type_convert(c * d1 * d0); + } + // for reference cpu + template <> + __host__ __device__ constexpr void operator()( + float& e, const float& c, const float& d0, const float& d1, const float& d2) const + { + // for reference cpu + e = ck::type_convert(c * d0 * d1 * d2); + } +}; + +using CDEElementOp = MulABScaleExpertWeight; // combine MulRoutedWeight = true // using DsLayout = DsLayoutGate; // using DsDataType = DsDataTypeGate; -using CDEElementOp = MulABScale; +// using CDEElementOp = MulABScale; // combine MulRoutedWeight = false // using CDEElementOp = MulABScaleSiluMulGate; @@ -133,11 +162,13 @@ static constexpr ck::index_t NPerBlock = 128; static constexpr ck::index_t MNPerXDL = 32; static constexpr ck::index_t KPerBlock = 128 / sizeof(A0DataType); static constexpr ck::index_t Nswizzle = true; +static constexpr bool MulRoutedWeight = false; static constexpr ck::index_t AK1 = 16 / sizeof(A0DataType); static constexpr ck::index_t BK1 = 16 / sizeof(B0DataType); static 
constexpr ck::index_t EVec = 16 / sizeof(EDataType); static constexpr ck::index_t D0Vec = 1; static constexpr ck::index_t D1Vec = 1; +static constexpr ck::index_t D2Vec = 1; // using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3 using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm // clang-format off @@ -157,8 +188,8 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm // CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| // MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| // PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - 2, 1, S<1, 32, 1, 8>, S, - ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, Nswizzle, true, A0DataType>; + 2, 1, S<1, 32, 1, 8>, S, + ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, Nswizzle, true, MulRoutedWeight, A0DataType>; // clang-format on @@ -224,7 +255,7 @@ int main(int argc, char* argv[]) ck::index_t StrideB = K; ck::index_t StrideE = N; constexpr ck::index_t NumDTensor = DsDataType::Size(); - constexpr auto StrideDs = std::array{1, 0}; + constexpr auto StrideDs = std::array{0, 0, 0}; ck::index_t KBatch = 1; @@ -266,6 +297,7 @@ int main(int argc, char* argv[]) Tensor b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K})); Tensor d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0})); Tensor d1_e_n(HostTensorDescriptor({experts, N}, {1, StrideDs[1]})); + Tensor d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0})); Tensor e_t_n_host_result(HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1})); Tensor e_t_n_device_result( HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1})); @@ -273,6 +305,7 @@ int main(int argc, char* argv[]) std::cout << "b0_e_n_k: " << b0_e_n_k.mDesc << std::endl; std::cout << "d1_e_n: " << d1_e_n.mDesc << std::endl; std::cout << "d0_t_n: " << d0_t_n.mDesc << std::endl; + std::cout << "d2_e_n: " << d2_e_n.mDesc << 
std::endl; std::cout << "e_t_n: " << e_t_n_host_result.mDesc << std::endl; switch(init_method) @@ -283,24 +316,28 @@ int main(int argc, char* argv[]) b0_e_n_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); d0_t_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); d1_e_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + d2_e_n.GenerateTensorValue(GeneratorTensor_3{-2, 2}); break; case 2: a0_t_k.GenerateTensorValue(GeneratorTensor_1{}); b0_e_n_k.GenerateTensorValue(GeneratorTensor_1{}); d0_t_n.GenerateTensorValue(GeneratorTensor_1{}); d1_e_n.GenerateTensorValue(GeneratorTensor_1{}); + d2_e_n.GenerateTensorValue(GeneratorTensor_3{}); break; case 3: a0_t_k.GenerateTensorValue(GeneratorTensor_1{}); b0_e_n_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); d0_t_n.GenerateTensorValue(GeneratorTensor_1{}); d1_e_n.GenerateTensorValue(GeneratorTensor_1{}); + d2_e_n.GenerateTensorValue(GeneratorTensor_3{}); break; default: a0_t_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); b0_e_n_k.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); d0_t_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); d1_e_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + d2_e_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); } DeviceMem sorted_token_ids_dev(sizeof(ck::index_t) * sorted_token_ids.mDesc.GetElementSpaceSize()); @@ -310,6 +347,7 @@ int main(int argc, char* argv[]) DeviceMem b0_device_buf(sizeof(B0DataType) * b0_e_n_k.mDesc.GetElementSpaceSize()); DeviceMem d0_device_buf(sizeof(D0DataType) * d0_t_n.mDesc.GetElementSpaceSize()); DeviceMem d1_device_buf(sizeof(D1DataType) * d1_e_n.mDesc.GetElementSpaceSize()); + DeviceMem d2_device_buf(sizeof(D2DataType) * d2_e_n.mDesc.GetElementSpaceSize()); DeviceMem e_device_buf(sizeof(EDataType) * e_t_n_device_result.mDesc.GetElementSpaceSize()); // a0_t_k.savetxt("a.txt"); // d0_t_n.savetxt("d0_t_n.txt", "int"); @@ -320,6 +358,7 @@ int main(int argc, char* argv[]) a0_device_buf.ToDevice(a0_t_k.mData.data()); d0_device_buf.ToDevice(d0_t_n.mData.data()); 
d1_device_buf.ToDevice(d1_e_n.mData.data()); + d2_device_buf.ToDevice(d2_e_n.mData.data()); auto a_element_op = AElementOp{}; auto b_element_op = BElementOp{}; @@ -342,7 +381,8 @@ int main(int argc, char* argv[]) a0_device_buf.GetDeviceBuffer(), b0_device_buf.GetDeviceBuffer(), std::array{d0_device_buf.GetDeviceBuffer(), - d1_device_buf.GetDeviceBuffer()}, + d1_device_buf.GetDeviceBuffer(), + d2_device_buf.GetDeviceBuffer()}, e_device_buf.GetDeviceBuffer(), tokens, topk, @@ -392,10 +432,12 @@ int main(int argc, char* argv[]) using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceMoeGemm; + PassThrough, + MulRoutedWeight>; auto ref_moe_gemm = ReferenceGemmInstance{}; auto ref_invoker = ref_moe_gemm.MakeInvoker(); @@ -406,6 +448,7 @@ int main(int argc, char* argv[]) a0_t_k, b0_e_n_k, c_t_k_n, + d2_e_n, PassThrough{}, PassThrough{}, PassThrough{}); @@ -428,7 +471,8 @@ int main(int argc, char* argv[]) cde_element_op(e_t_n_host_result(t, topk_id, n), c_t_k_n(t, topk_id, n), d0_t_n(t, n), - d1_e_n(e, n)); + d1_e_n(e, n), + 1.f); } } diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp index a25d1b5fa3..fb8a8b9826 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include #include @@ -39,14 +39,15 @@ using AccDataType = F32; using CShuffleDataType = F32; using D0DataType = F32; using D1DataType = F32; -using DsDataType = ck::Tuple; +using D2DataType = F32; +using DsDataType = ck::Tuple; using A0Layout = Row; using B0Layout = Col; using ELayout = Row; using D0Layout = Row; using D1Layout = Col; -using DsLayout = ck::Tuple; +using DsLayout = ck::Tuple; // for gate, a_scale, b_scale struct MulABScale @@ -91,7 +92,39 @@ struct MulABScaleSilu } }; -using CDEElementOp = MulABScale; +struct MulABScaleExpertWeight +{ + template + __host__ __device__ constexpr void + operator()(E& e, const C& c, const D0& d0, const D1& d1, const D2& d2) const; + // for real kernel use + template <> + __host__ __device__ constexpr void operator()( + EDataType& e, const float& c, const float& d0, const float& d1, const float& d2) const + { + (void)d2; + +#if CK_USE_PK4_LAYOUT_SHUFFLE + e = ck::type_convert(c * d1 * d0 * 16); +#else + e = ck::type_convert(c * d1 * d0); +#endif + } + // for reference cpu + template <> + __host__ __device__ constexpr void operator()( + float& e, const float& c, const float& d0, const float& d1, const float& d2) const + { + // for reference cpu +#if CK_USE_PK4_LAYOUT_SHUFFLE + e = ck::type_convert(c * d0 * d1 * d2 * 16); +#else + e = ck::type_convert(c * d0 * d1 * d2); +#endif + } +}; + +using CDEElementOp = MulABScaleExpertWeight; #if 1 void preShuffleBuffer(const I4* src, I4* dst, int N, int K, int NXdl) @@ -164,6 +197,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm< #else static constexpr ck::index_t MPerBlock = 128; static constexpr ck::index_t Nswizzle = false; +static constexpr bool MulRoutedWeight = false; // clang-format off using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm< Row, Col, DsLayout, ELayout, @@ -175,8 +209,8 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm< 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 
2>, S<1, 0, 2>, 2, 32, 32, 0, - 1, 1, S<1, 32, 1, 8>, S<8, 1, 1>, - ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, Nswizzle, true, A0DataType>; + 1, 1, S<1, 32, 1, 8>, S<8, 1, 1, 1>, + ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, Nswizzle, true, MulRoutedWeight, A0DataType>; // clang-format on #endif @@ -265,6 +299,7 @@ int main(int argc, char* argv[]) Tensor b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K})); Tensor d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0})); Tensor d1_e_n(HostTensorDescriptor({experts, N}, {1, StrideDs[1]})); + Tensor d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0})); Tensor e_t_n_host_result(HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1})); Tensor e_t_n_device_result( HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1})); @@ -283,18 +318,21 @@ int main(int argc, char* argv[]) b0_e_n_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); d0_t_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); d1_e_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + d2_e_n.GenerateTensorValue(GeneratorTensor_3{-2, 2}); break; case 2: a0_t_k.GenerateTensorValue(GeneratorTensor_1{}); b0_e_n_k.GenerateTensorValue(GeneratorTensor_1{}); d0_t_n.GenerateTensorValue(GeneratorTensor_1{}); d1_e_n.GenerateTensorValue(GeneratorTensor_1{}); + d2_e_n.GenerateTensorValue(GeneratorTensor_3{}); break; default: a0_t_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); b0_e_n_k.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); d0_t_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); d1_e_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + d2_e_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); } DeviceMem sorted_token_ids_dev(sizeof(ck::index_t) * sorted_token_ids.mDesc.GetElementSpaceSize()); @@ -304,6 +342,7 @@ int main(int argc, char* argv[]) DeviceMem b0_device_buf(sizeof(B0DataType) * b0_e_n_k.mDesc.GetElementSpaceSize() / 2); DeviceMem d0_device_buf(sizeof(D0DataType) 
* d0_t_n.mDesc.GetElementSpaceSize()); DeviceMem d1_device_buf(sizeof(D1DataType) * d1_e_n.mDesc.GetElementSpaceSize()); + DeviceMem d2_device_buf(sizeof(D2DataType) * d2_e_n.mDesc.GetElementSpaceSize()); DeviceMem e_device_buf(sizeof(EDataType) * e_t_n_device_result.mDesc.GetElementSpaceSize()); sorted_token_ids_dev.ToDevice(sorted_token_ids.mData.data()); @@ -312,6 +351,7 @@ int main(int argc, char* argv[]) a0_device_buf.ToDevice(a0_t_k.mData.data()); d0_device_buf.ToDevice(d0_t_n.mData.data()); d1_device_buf.ToDevice(d1_e_n.mData.data()); + d2_device_buf.ToDevice(d2_e_n.mData.data()); auto a_element_op = AElementOp{}; auto b_element_op = BElementOp{}; @@ -424,7 +464,8 @@ int main(int argc, char* argv[]) a0_device_buf.GetDeviceBuffer(), b0_device_buf.GetDeviceBuffer(), std::array{d0_device_buf.GetDeviceBuffer(), - d1_device_buf.GetDeviceBuffer()}, + d1_device_buf.GetDeviceBuffer(), + d2_device_buf.GetDeviceBuffer()}, e_device_buf.GetDeviceBuffer(), tokens, topk, @@ -480,10 +521,12 @@ int main(int argc, char* argv[]) using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceMoeGemm; + PassThrough, + MulRoutedWeight>; auto ref_moe_gemm = ReferenceGemmInstance{}; auto ref_invoker = ref_moe_gemm.MakeInvoker(); @@ -494,6 +537,7 @@ int main(int argc, char* argv[]) a0_t_k, b0_e_n_k, c_t_k_n, + d2_e_n, PassThrough{}, PassThrough{}, PassThrough{}); @@ -516,7 +560,8 @@ int main(int argc, char* argv[]) cde_element_op(e_t_n_host_result(t, topk_id, n), c_t_k_n(t, topk_id, n), d0_t_n(t, n), - d1_e_n(e, n)); + d1_e_n(e, n), + 1.f); } } diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp index 0d12441016..04f10b53ae 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include #include @@ -135,6 +135,7 @@ static constexpr ck::index_t EVec = 2; static constexpr ck::index_t D0Vec = 1; static constexpr ck::index_t D1Vec = 1; static constexpr ck::index_t D2Vec = 1; +static constexpr bool MulRoutedWeight = false; using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm // clang-format off ///######| ALayout| BLayout| DsLayout| ELayout| AData| BData| DsData| EData| AccData| CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -164,7 +165,7 @@ using DeviceOpInstance = ck::tensor_operation::device::Devic // MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| // PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| 2, 1, S<1, CShuffleMLane, 1, CShuffleNLane>, S, - ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, false, false, A0DataType>; + ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, false, false, MulRoutedWeight, A0DataType>; // kernel 2: 128->32x128x128 // < Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 128, 32, 128, 128, 16, 16, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<8, 8, 1>, ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, EDataType>; @@ -409,7 +410,8 @@ int main(int argc, char* argv[]) AccDataType, PassThrough, PassThrough, - CDEElementOp>; + CDEElementOp, + MulRoutedWeight>; auto ref_moe_gemm = ReferenceGemmInstance{}; 
auto ref_invoker = ref_moe_gemm.MakeInvoker(); auto ref_argument = ref_moe_gemm.MakeArgument(sorted_token_ids, diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp index 8c2c70b4a1..ba4e40151f 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include #include @@ -138,6 +138,7 @@ static constexpr ck::index_t EVec = 2; static constexpr ck::index_t D0Vec = 1; static constexpr ck::index_t D1Vec = 1; static constexpr ck::index_t D2Vec = 1; +static constexpr bool MulRoutedWeight = true; using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm // clang-format off < Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, @@ -149,7 +150,7 @@ using DeviceOpInstance = ck::tensor_operation::device::Devic S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, BK1, BK1, 0, 1, 1, S<1, CShuffleMLane, 1, CShuffleNLane>, S, - ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, false, false, A0DataType>; + ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, false, false, MulRoutedWeight, A0DataType>; // clang-format on int main(int argc, char* argv[]) @@ -455,7 +456,8 @@ int main(int argc, char* argv[]) AccDataType, PassThrough, PassThrough, - CDEElementOp>; + CDEElementOp, + MulRoutedWeight>; auto ref_moe_gemm = ReferenceGemmInstance{}; auto ref_invoker = ref_moe_gemm.MakeInvoker(); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp index f3fc1aaa9f..03db4bdd41 100644 --- 
a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -67,6 +67,7 @@ template ; RunKernel(kernel); } @@ -280,6 +282,7 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle; RunKernel(kernel); } @@ -295,6 +298,7 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle; RunKernel(kernel); } @@ -305,6 +309,7 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle; RunKernel(kernel); } @@ -325,6 +330,7 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle; RunKernel(kernel); } diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp index 1924c27b2b..a2d1114bbe 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -31,6 +31,7 @@ template __global__ void #if CK_USE_LAUNCH_BOUNDS @@ -44,19 +45,22 @@ __global__ void auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z); - GridwiseGemm::template Run( - karg.p_sorted_token_ids, - karg.p_sorted_expert_ids, - karg.p_max_token_id, - karg.p_a_grid + splitk_batch_offset.a_k_split_offset, - karg.p_b_grid + splitk_batch_offset.b_k_split_offset, - karg.p_ds_grid, - karg.p_c_grid, - p_shared, - karg, - karg.a_element_op, - karg.b_element_op, - karg.c_element_op); + GridwiseGemm::template Run(karg.p_sorted_token_ids, + karg.p_sorted_expert_ids, + karg.p_max_token_id, + karg.p_a_grid + splitk_batch_offset.a_k_split_offset, + karg.p_b_grid + splitk_batch_offset.b_k_split_offset, + karg.p_ds_grid, + karg.p_c_grid, + p_shared, + karg, + karg.a_element_op, + karg.b_element_op, + karg.c_element_op); #else ignore = karg; #endif // end of if (defined(__gfx9__)) @@ -67,6 +71,7 @@ template __global__ void #if CK_USE_LAUNCH_BOUNDS @@ -81,21 +86,23 @@ __global__ void auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z); - GridwiseGemm:: - template Run_2Lds( - karg.p_sorted_token_ids, - karg.p_sorted_expert_ids, - karg.p_max_token_id, - karg.p_a_grid + splitk_batch_offset.a_k_split_offset, - karg.p_b_grid + splitk_batch_offset.b_k_split_offset, - karg.p_ds_grid, - karg.p_c_grid, - p_shared, - p_shared1, - karg, - karg.a_element_op, - karg.b_element_op, - karg.c_element_op); + GridwiseGemm::template Run_2Lds(karg.p_sorted_token_ids, + karg.p_sorted_expert_ids, + karg.p_max_token_id, + karg.p_a_grid + splitk_batch_offset.a_k_split_offset, + karg.p_b_grid + splitk_batch_offset.b_k_split_offset, + karg.p_ds_grid, + karg.p_c_grid, + p_shared, + p_shared1, + karg, + karg.a_element_op, + karg.b_element_op, + karg.c_element_op); #else ignore = karg; #endif // end of if (defined(__gfx9__)) @@ -1134,8 +1141,9 @@ struct GridwiseMoeGemm template + bool IsInputGemm = true, + bool 
MulRoutedWeight = true, + TailNumber TailNum = TailNumber::Odd> __device__ static void Run(const index_t* p_sorted_token_ids, const index_t* p_sorted_expert_ids, const index_t* p_max_token_id, @@ -1492,7 +1500,7 @@ struct GridwiseMoeGemm using CDEBlockTransferCluster = CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock; const auto EGlobalMemoryDataOperation = CGlobalMemoryDataOperation; - constexpr index_t scatter_weight_idx = IsInputGemm ? 1 : 3; // hack fix felix + constexpr index_t scatter_weight_idx = 3; // hack fix felix auto cde_block_copy_lds_and_global = ThreadGroupTensorSliceTransfer_v7r3_scatter< ThisThreadBlock, decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})), @@ -1579,10 +1587,13 @@ struct GridwiseMoeGemm { token_offset = token_offset * problem.TopK + (fused_token >> 24); } - else + if constexpr(MulRoutedWeight) { const float* p_sorted_weights_2 = p_ds_grid[I2]; - weight = weight * p_sorted_weights_2[c_token_pos + m0]; + if constexpr(sizeof(ADataType) < 2) + weight = p_sorted_weights_2[c_token_pos + m0] * weight; + else + weight = p_sorted_weights_2[c_token_pos + m0]; } scatter_offsets(m0) = token_offset * problem.N; scatter_weights(m0) = weight; @@ -1632,8 +1643,9 @@ struct GridwiseMoeGemm template + bool IsInputGemm = true, + bool MulRoutedWeight = true, + TailNumber TailNum = TailNumber::Odd> __device__ static void Run_2Lds(const index_t* p_sorted_token_ids, const index_t* p_sorted_expert_ids, const index_t* p_max_token_id, @@ -1998,7 +2010,7 @@ struct GridwiseMoeGemm using CDEBlockTransferCluster = CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock; const auto EGlobalMemoryDataOperation = CGlobalMemoryDataOperation; - constexpr index_t scatter_weight_idx = IsInputGemm ? 
1 : 3; // hack fix felix + constexpr index_t scatter_weight_idx = 3; // hack fix felix auto cde_block_copy_lds_and_global = ThreadGroupTensorSliceTransfer_v7r3_scatter< ThisThreadBlock, decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})), @@ -2086,10 +2098,13 @@ struct GridwiseMoeGemm { token_offset = token_offset * problem.TopK + (fused_token >> 24); } - else + if constexpr(MulRoutedWeight) { const float* p_sorted_weights_2 = p_ds_grid[I2]; - weight = weight * p_sorted_weights_2[c_token_pos + m0]; + if constexpr(sizeof(ADataType) < 2) + weight = p_sorted_weights_2[c_token_pos + m0] * weight; + else + weight = p_sorted_weights_2[c_token_pos + m0]; } scatter_offsets(m0) = token_offset * problem.N; scatter_weights(m0) = weight; diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp index af735925ed..72c9dc86ac 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -18,10 +18,12 @@ namespace host { template struct ReferenceMoeGemm : public device::BaseOperator @@ -36,6 +38,7 @@ struct ReferenceMoeGemm : public device::BaseOperator const Tensor& a_t_k, const Tensor& b_e_n_k, Tensor& c_t_k_n, + const Tensor& d2, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CElementwiseOperation c_element_op) @@ -46,6 +49,7 @@ struct ReferenceMoeGemm : public device::BaseOperator a_t_k_{a_t_k}, b_e_n_k_{b_e_n_k}, c_t_k_n_{c_t_k_n}, + d2_{d2}, a_element_op_{a_element_op}, b_element_op_{b_element_op}, c_element_op_{c_element_op} @@ -59,6 +63,7 @@ struct ReferenceMoeGemm : public device::BaseOperator const Tensor& a_t_k_; const Tensor& b_e_n_k_; Tensor& c_t_k_n_; + const Tensor& d2_; AElementwiseOperation a_element_op_; BElementwiseOperation b_element_op_; @@ -81,6 +86,7 @@ struct ReferenceMoeGemm : public device::BaseOperator const int topk_id = (arg.sorted_token_ids_(m) & 0xff000000) >> 24; const int e = arg.expert_ids_(m / arg.sorted_tile_size_); const int token_cnt = arg.a_t_k_.mDesc.GetLengths()[0]; + D2DataType v_topk_w = arg.d2_(m, 0); // expert if(t < token_cnt) { for(int k = 0; k < K; ++k) @@ -128,6 +134,11 @@ struct ReferenceMoeGemm : public device::BaseOperator } CDataType v_c{0}; + if constexpr(MulRoutedWeight) + { + v_acc *= v_topk_w; + } + arg.c_element_op_(v_c, v_acc); arg.c_t_k_n_(t, topk_id, n) = v_c; @@ -164,6 +175,7 @@ struct ReferenceMoeGemm : public device::BaseOperator const Tensor& a_t_k, const Tensor& b_e_n_k, Tensor& c_t_k_n, + const Tensor& d2, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CElementwiseOperation c_element_op) @@ -175,6 +187,7 @@ struct ReferenceMoeGemm : public device::BaseOperator a_t_k, b_e_n_k, c_t_k_n, + d2, a_element_op, b_element_op, c_element_op}; diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp index 
1e8a086bc4..fb5c71e30a 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -25,6 +25,7 @@ template struct ReferenceMoeGemm2 : public device::BaseOperator @@ -143,7 +144,14 @@ struct ReferenceMoeGemm2 : public device::BaseOperator CDataType v_c{0}; D0DataType v_d0 = arg.d0_(m, n); // a D0DataType v_d1 = arg.d1_(e, n); // b - arg.c_element_op_(v_c, v_acc, v_d0, v_d1, v_topk_w); + if constexpr(MulRoutedWeight) + { + arg.c_element_op_(v_c, v_acc, v_d0, v_d1, v_topk_w); + } + else + { + arg.c_element_op_(v_c, v_acc, v_d0, v_d1, 1.f); + } arg.c_t_n_(t, n) += v_c; } }; From c318ec0778f0b9db90618ac51185ff6f9dfab0e1 Mon Sep 17 00:00:00 2001 From: solin Date: Fri, 18 Apr 2025 09:15:27 +0000 Subject: [PATCH 053/443] fix CI build fail --- .../ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp index 3d08c7a788..611aff318f 100644 --- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp @@ -4,6 +4,7 @@ #pragma once #include "ck_tile/core.hpp" +#include "ck_tile/host/concat.hpp" #include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp" namespace ck_tile { From 7cadf187e28693eb211c9cfb76d72ba0d6fb28b8 Mon Sep 17 00:00:00 2001 From: Khushbu Agarwal Date: Mon, 21 Apr 2025 08:39:45 -0700 Subject: [PATCH 054/443] multi instance generation for CkTileEngine (#2080) * Add support for multi-instance 
verification, print detail for each instance, documentation fix * clang formatted * Added Readme file * updated readme * Addressing review comments * clang formatted * Updated ReadMe and GPU reference code * simplified dispatch kernel code * indentation --- tile_engine/ops/gemm/README.md | 51 ++++++ .../gemm/configs/instance_combination.json | 2 +- tile_engine/ops/gemm/gemm_host_api.cpp | 79 +++++----- tile_engine/ops/gemm/gemm_host_api.hpp | 146 +++++++----------- tile_engine/ops/gemm/gemm_instance_builder.py | 64 ++++++-- 5 files changed, 202 insertions(+), 140 deletions(-) create mode 100644 tile_engine/ops/gemm/README.md diff --git a/tile_engine/ops/gemm/README.md b/tile_engine/ops/gemm/README.md new file mode 100644 index 0000000000..495232f19b --- /dev/null +++ b/tile_engine/ops/gemm/README.md @@ -0,0 +1,51 @@ +# GEMM Matrix Multiplication + +Use the files in this folder to generate and build applications that run Matrix multiplications using ck_tile programming based on the kernel parameters mentioned in the config file `./configs/instance_combination.json`. + +# Kernel Configurations + +User needs to provide kernel configuration such as datatype, layout, tile size, warp size, padding, pipeline, scheduler and epilogue in the config file. For reference please see `./configs/instance_combination.json` + +## Build +``` +# in the root of ck_tile +mkdir build && cd build +# you can replace with the appropriate architecture (for example gfx90a or gfx942) or leave it blank +sh ../script/cmake-ck-dev.sh ../ +# To generate the executable +make tile_engine_gemm -j +``` +`tile_engine_gemm` will be located in the `./bin/` directory. 
+ +## tile_engine_gemm inputs +``` + + -m m dimension (default:3840) + -n n dimension (default:4096) + -k k dimension (default:2048) + -stride_a Tensor A stride (default:0) + -stride_b Tensor B stride (default:0) + -stride_c Tensor C stride (default:0) + -split_k SplitK value (default:1) + -v No validation: 0, Validation on CPU: 1, Validation on GPU: 2 (default:2) + -warmup Number of iterations before benchmark the kernel (default:50) + -repeat Number of iterations to benchmark the kernel (default:100) + -timer gpu:gpu timer, cpu:cpu timer (default:gpu) + -init Value for initializing tensor- random: 0, linear: 1, constant(1): 2 (default:0) + -pipeline possible values are: compv3, compv4, mem (default:compv3) + -scheduler possible values are: intrawave, interwave (default:intrawave) + -epilogue possible values are: cshuffle, default (default:cshuffle) + -pad_m Pad in m direction - true/false (default:false) + -pad_n Pad in n direction - true/false (default:false) + -pad_k Pad in k direction - true/false (default:false) + +Note: pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be one of the options specified in instance_combination.json +``` + +## Example + +Below example will run gemm kernel with default dimensions of matrices, for compv3 pipeline, intrawave scheduler and default epilogue with all possible tile sizes mentioned in Config file. 
+ +``` +./bin/tile_engine_gemm -pipeline=compv3 -scheduler=intrawave -epilogue=default +``` diff --git a/tile_engine/ops/gemm/configs/instance_combination.json b/tile_engine/ops/gemm/configs/instance_combination.json index e21197d1de..e23df11500 100644 --- a/tile_engine/ops/gemm/configs/instance_combination.json +++ b/tile_engine/ops/gemm/configs/instance_combination.json @@ -19,7 +19,7 @@ "values": [256] }, "tile_k": { - "values": [64] + "values": [64, 32] }, "warp_m": { "values": [2] diff --git a/tile_engine/ops/gemm/gemm_host_api.cpp b/tile_engine/ops/gemm/gemm_host_api.cpp index 508f634920..3cef425a51 100644 --- a/tile_engine/ops/gemm/gemm_host_api.cpp +++ b/tile_engine/ops/gemm/gemm_host_api.cpp @@ -6,11 +6,16 @@ #include "gemm_dispatcher.hpp" #include "gemm_host_api.hpp" -float gemm_kernel_launch(KernelTraits& trait, - ck_tile::GemmHostArgs& args, - const ck_tile::stream_config& s) +void gemm_kernel_launch(ck_tile::DeviceMem& c_m_n_dev_buf, + ck_tile::HostTensor& c_m_n_host_result, + ck_tile::HostTensor& c_m_n_dev_result, + int verify, + KernelTraits& trait, + ck_tile::GemmHostArgs& args, + const ck_tile::stream_config& s) { - return GemmDispatcher::dispatch(trait, args, s); + return GemmDispatcher::dispatch( + c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, trait, args, s); } template -bool run(const ck_tile::ArgParser& arg_parser) +void run(const ck_tile::ArgParser& arg_parser) { const ALayout a_layout = ALayout{}; const BLayout b_layout = BLayout{}; - // const CLayout c_layout = CLayout{}; ck_tile::index_t kbatch = arg_parser.get_int("split_k"); ck_tile::index_t M = arg_parser.get_int("m"); @@ -113,43 +117,47 @@ bool run(const ck_tile::ArgParser& arg_parser) trait.kPadN = arg_parser.get_bool("pad_n"); trait.kPadK = arg_parser.get_bool("pad_k"); - float ave_time = gemm_kernel_launch( - trait, gemm_args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); - - std::size_t flop = std::size_t(2) * M * N * K; - std::size_t num_byte = - 
sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N; - float tflops = static_cast(flop) / 1.E9 / ave_time; - float gb_per_sec = num_byte / 1.E6 / ave_time; - std::cout << "Run Gemm kernel with M =" << M << " N =" << N << " K =" << K << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C << " A_Layout =" << ALayout::name << " B_Layout =" << BLayout::name << " C_Layout =" << CLayout::name << " A Type = " << DataTypeTraits::name << " B Type = " << DataTypeTraits::name - << " C Type = " << DataTypeTraits::name << " : " << ave_time << " ms, " - << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl; + << " C Type = " << DataTypeTraits::name << std::endl; + + ck_tile::HostTensor c_m_n_host_result( + ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{}))); - c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); - bool pass = true; if(verify) { - pass = gemm_verify( - verify, - a_m_k, - b_k_n, - c_m_n_dev_result, - a_m_k_dev_buf, - b_k_n_dev_buf, - M, - N, - K, - stride_A, - stride_B, - stride_C, - kbatch); + gemm_host_reference(verify, + a_m_k, + b_k_n, + c_m_n_host_result, + a_m_k_dev_buf, + b_k_n_dev_buf, + M, + N, + K, + stride_A, + stride_B, + stride_C); } - return pass; + + gemm_kernel_launch(c_m_n_dev_buf, + c_m_n_host_result, + c_m_n_dev_result, + verify, + trait, + gemm_args, + ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); + + return; } int main(int argc, char* argv[]) @@ -159,7 +167,8 @@ int main(int argc, char* argv[]) auto [result, parser] = create_args(argc, argv); if(!result) return EXIT_FAILURE; - return run(parser); + run(parser); + return 0; } catch(const std::exception& e) { diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp index 375f808966..c1e1e1dc4f 100644 --- a/tile_engine/ops/gemm/gemm_host_api.hpp +++ b/tile_engine/ops/gemm/gemm_host_api.hpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 
2025, Advanced Micro Devices, Inc. All rights reserved. + #include #include @@ -54,24 +57,21 @@ struct DataTypeTraits static constexpr const char* name = "pk_int4_t"; }; -/** - * @brief trait for GEMM kernel - * @param pipeline: pipeline name - * @param scheduler: scheduler name - * @param epilogue: epilogue name - * @param kPadM: padding for M dimension - * @param kPadN: padding for N dimension - * @param kPadK: padding for K dimension - * - */ - +/// @brief Defines the configuration parameters for a GEMM operation, enabling the selection of a +/// specific kernel instance based on the provided settings. struct KernelTraits { + /// @brief The name of the pipeline. std::string pipeline; + /// @brief The name of the scheduler (e.g., "intrawave", "interwave"). std::string scheduler; + /// @brief The name of the epilogue (e.g., "cshuffle", "default"). std::string epilogue; + /// @brief Indicates whether padding is applied to the M dimension. bool kPadM; + /// @brief Indicates whether padding is applied to the N dimension. bool kPadN; + /// @brief Indicates whether padding is applied to the K dimension. 
bool kPadK; }; @@ -184,11 +184,28 @@ void permute_vectors_i4x4_b(Tensor& tensor) } } -/** - * @brief Function to verify the kernel output with reference implementation on CPU/GPU - * - */ +/// @brief Function to compare the results of the device and host computations +void compare(ck_tile::index_t K, + ck_tile::index_t kbatch, + ck_tile::HostTensor& c_m_n_dev_result, + ck_tile::HostTensor& c_m_n_host_result) +{ + const float max_accumulated_value = + *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end()); + const auto rtol_atol = calculate_rtol_atol( + K, kbatch, max_accumulated_value); + bool pass = ck_tile::check_err(c_m_n_dev_result, + c_m_n_host_result, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) << std::endl; + std::cout << "The verification result is:" << (pass ? 
"correct" : "fail") << std::endl; +} + +/// @brief Function to get the kernel output with reference implementation on CPU/GPU template -bool gemm_verify(int verify, - ck_tile::HostTensor& a_m_k, - ck_tile::HostTensor& b_k_n, - ck_tile::HostTensor& c_m_n_dev_result, - ck_tile::DeviceMem& a_m_k_dev_buf, - ck_tile::DeviceMem& b_k_n_dev_buf, - ck_tile::index_t M, - ck_tile::index_t N, - ck_tile::index_t K, - ck_tile::index_t stride_A, - ck_tile::index_t stride_B, - ck_tile::index_t stride_C, - ck_tile::index_t kbatch) +void gemm_host_reference(int verify, + ck_tile::HostTensor& a_m_k, + ck_tile::HostTensor& b_k_n, + ck_tile::HostTensor& c_m_n_host_result, + ck_tile::DeviceMem& a_m_k_dev_buf, + ck_tile::DeviceMem& b_k_n_dev_buf, + ck_tile::index_t M, + ck_tile::index_t N, + ck_tile::index_t K, + ck_tile::index_t stride_A, + ck_tile::index_t stride_B, + ck_tile::index_t stride_C) { - bool pass = true; if(verify == 1) { - ck_tile::HostTensor c_m_n_host_ref( - ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{}))); - c_m_n_host_ref.SetZero(); + c_m_n_host_result.SetZero(); ck_tile::reference_gemm( - a_m_k, b_k_n, c_m_n_host_ref); - const float max_accumulated_value = - *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end()); - const auto rtol_atol = calculate_rtol_atol( - K, kbatch, max_accumulated_value); - pass = ck_tile::check_err(c_m_n_dev_result, - c_m_n_host_ref, - "Error: Incorrect results!", - rtol_atol.at(ck_tile::number<0>{}), - rtol_atol.at(ck_tile::number<1>{})); - - std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) - << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) - << std::endl; - std::cout << "The CPU verification result is:" << (pass ? 
"correct" : "fail") << std::endl; + a_m_k, b_k_n, c_m_n_host_result); } else if(verify == 2) { @@ -241,29 +240,14 @@ bool gemm_verify(int verify, // Restore input for B for gpu reference b_k_n_dev_buf.ToDevice(b_k_n.data()); } - ck_tile::HostTensor c_m_n_gpu_ref( - ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{}))); - ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes()); - c_m_n_gpu_ref.SetZero(); + + ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_host_result.get_element_space_size_in_bytes()); + c_m_n_host_result.SetZero(); c_m_n_gpu_buf_ref.SetZero(); - ADataType* d_A; - BDataType* d_B; - CDataType* d_C; - - ck_tile::hip_check_error(hipMalloc(&d_A, a_m_k.get_element_space_size_in_bytes())); - ck_tile::hip_check_error(hipMalloc(&d_B, b_k_n.get_element_space_size_in_bytes())); - ck_tile::hip_check_error( - hipMalloc(&d_C, c_m_n_dev_result.get_element_space_size_in_bytes())); - - ck_tile::hip_check_error(hipMemcpy(d_A, - a_m_k_dev_buf.GetDeviceBuffer(), - a_m_k.get_element_space_size_in_bytes(), - hipMemcpyHostToDevice)); - ck_tile::hip_check_error(hipMemcpy(d_B, - b_k_n_dev_buf.GetDeviceBuffer(), - b_k_n.get_element_space_size_in_bytes(), - hipMemcpyHostToDevice)); + ADataType* d_A = static_cast(a_m_k_dev_buf.GetDeviceBuffer()); + BDataType* d_B = static_cast(b_k_n_dev_buf.GetDeviceBuffer()); + CDataType* d_C = static_cast(c_m_n_gpu_buf_ref.GetDeviceBuffer()); ck_tile::reference_gemm_gpu(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C); - ck_tile::hip_check_error(hipMemcpy(c_m_n_gpu_buf_ref.GetDeviceBuffer(), - d_C, - c_m_n_dev_result.get_element_space_size_in_bytes(), - hipMemcpyDeviceToHost)); - - ck_tile::hip_check_error(hipFree(d_A)); - ck_tile::hip_check_error(hipFree(d_B)); - ck_tile::hip_check_error(hipFree(d_C)); - - c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data()); - const float max_accumulated_value = - *std::max_element(c_m_n_gpu_ref.mData.begin(), c_m_n_gpu_ref.mData.end()); - const auto 
rtol_atol = calculate_rtol_atol( - K, kbatch, max_accumulated_value); - pass = ck_tile::check_err(c_m_n_dev_result, - c_m_n_gpu_ref, - "Error: Incorrect results!", - rtol_atol.at(ck_tile::number<0>{}), - rtol_atol.at(ck_tile::number<1>{})); - - std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) - << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) - << std::endl; - std::cout << "The GPU verification result is: " << (pass ? "correct" : "fail") << std::endl; + c_m_n_gpu_buf_ref.FromDevice(c_m_n_host_result.data()); } - return pass; } diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index e449dff94d..cfefd38cd2 100755 --- a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -447,6 +447,17 @@ struct GemmKernel {{ return ave_time; }} + static std::string get_name() {{ + return std::string("GemmKernel> kernel_map; + std::function& c_m_n_host_result, + ck_tile::HostTensor& c_m_n_dev_result, + int verify, ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>> kernel_map; return kernel_map; } @@ -499,9 +513,12 @@ struct GemmDispatcher { for group in self.all_kernels: - content += f""" kernel_map["{group}"] = [](ck_tile::GemmHostArgs& args, - const ck_tile::stream_config& s) {{ - std::vector results;""" + content += f""" kernel_map["{group}"] = [](ck_tile::DeviceMem& c_m_n_dev_buf, + ck_tile::HostTensor& c_m_n_host_result, + ck_tile::HostTensor& c_m_n_dev_result, + int verify, ck_tile::GemmHostArgs& args, + const ck_tile::stream_config& s) {{ + """ for tile in tile_params: # Check if we have valid tile/warp combinations # (tile_m/(warp_m*warp_tile_m)) * warp_m * warp_tile_m == tile_m @@ -509,21 +526,46 @@ struct GemmDispatcher { ((tile[1]/(tile[4] * tile[8]) * tile[4] * tile[8]) != tile[1]): continue content += f""" - //we can have multiple tiles config for the one kernel_trait - return {group}::GemmKernel<{tile[0]}, 
{tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}>::launch(args, s);""" - content += """ - };\n""" + run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, s);""" + content += f""" + }};\n""" content += """ } - - static float dispatch(const KernelTraits &trait, ck_tile::GemmHostArgs& gemm_args, + template + static void run_kernel(ck_tile::DeviceMem& c_m_n_dev_buf, + ck_tile::HostTensor& c_m_n_host_result, + ck_tile::HostTensor& c_m_n_dev_result, + int verify, ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) + { + float avg_time = Kernel::launch(args, s); + std::string description = Kernel::get_name(); + c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); + + std::size_t flop = std::size_t(2) * args.M * args.N * args.K; + std::size_t num_byte = sizeof(ADataType) * args.M * args.K + sizeof(BDataType) * args.N * args.K + sizeof(CDataType) * args.M * args.N; + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_byte / 1.E6 / avg_time; + + std::cout << "Performance for " << description << " : " << avg_time << " ms, " + << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl; + + if(verify) + compare(args.K, args.k_batch, c_m_n_dev_result, c_m_n_host_result); + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + } + + static auto dispatch(ck_tile::DeviceMem& c_m_n_dev_buf, + ck_tile::HostTensor& c_m_n_host_result, + ck_tile::HostTensor& c_m_n_dev_result, + int verify, const KernelTraits &trait, ck_tile::GemmHostArgs& gemm_args, const ck_tile::stream_config& s) { init(); const std::string key = assemble_key(trait); auto& kernel_map = get_kernel_map(); if(auto it = kernel_map.find(key); it != kernel_map.end()) { - return it->second(gemm_args, s); //Running single instance + return it->second(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, 
verify,gemm_args, s); } throw std::runtime_error("No suitable kernel found: " + key); } From ce6175953804dceec37cb1f19e4b5194b3ed9a24 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Mon, 21 Apr 2025 08:48:22 -0700 Subject: [PATCH 055/443] fix daily gfx942 build (#2106) --- Jenkinsfile | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 3d7019bd1f..f8043ba918 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -291,11 +291,6 @@ def cmake_build(Map conf=[:]){ setup_cmd = conf.get("setup_cmd", """${cmake_envs} cmake -G Ninja ${setup_args} -DCMAKE_CXX_FLAGS=" -O3 -ftime-trace " .. """) build_cmd = conf.get("build_cmd", "${build_envs} ninja -j${nt} ${config_targets}") } - else if (setup_args.contains("gfx908;gfx90a;gfx942")){ - //limit the number of build threads when building for multiple gfx9 targets - setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ") - build_cmd = conf.get("build_cmd", "${build_envs} make -j32 ${config_targets}") - } else{ setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. 
") build_cmd = conf.get("build_cmd", "${build_envs} make -j${nt} ${config_targets}") @@ -604,7 +599,7 @@ def Build_CK(Map conf=[:]){ stash includes: "perf_onnx_gemm_gfx12.log", name: "perf_log_gfx12" } else if ( arch_type == 6 ){ - // run standard tests on gfx908 + // run basic tests on gfx908 echo "Run performance tests" sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx908" archiveArtifacts "perf_onnx_gemm_gfx908.log" @@ -1115,11 +1110,11 @@ pipeline { agent{ label rocmnode("gfx942") } environment{ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \ - -DGPU_TARGETS="gfx908;gfx90a;gfx942" \ + -DGPU_TARGETS="gfx90a;gfx942" \ -DCMAKE_CXX_FLAGS=" -O3 " """ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ - -DGPU_TARGETS="gfx908;gfx90a;gfx942" \ + -DGPU_TARGETS="gfx90a;gfx942" \ -DCMAKE_CXX_COMPILER="${build_compiler()}" \ -DCMAKE_CXX_FLAGS=" -O3 " .. 
&& make -j """ } From a738e43445f9f82227220922fcd2d683cc9ef626 Mon Sep 17 00:00:00 2001 From: Thomas Ning Date: Mon, 21 Apr 2025 10:21:35 -0700 Subject: [PATCH 056/443] MFMA 16x16x32fp8 (#2103) * add mfma_16x16x32_fp8 * clang format code * Finished the fix for gemm basic * clang foramt * rebuild CI * recover gemm.hpp * add MFMA 16*16*32bf8 --------- Co-authored-by: solin --- .../gemm/pipeline/gemm_pipeline_problem.hpp | 2 + .../ops/gemm/pipeline/tile_gemm_traits.hpp | 3 +- include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 14 ++ .../warp/warp_gemm_attribute_mfma_impl.hpp | 167 +++++++++++++++++- .../ops/gemm/warp/warp_gemm_dispatcher.hpp | 4 + 5 files changed, 188 insertions(+), 2 deletions(-) diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp index cba3677332..0b38e7789e 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp @@ -32,6 +32,8 @@ struct GemmPipelineProblemBase static constexpr bool TransposeC = Traits::TransposeC; + static constexpr bool UseStructuredSparsity = Traits::UseStructuredSparsity; + static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size(); static constexpr bool kPadM = Traits::kPadM; diff --git a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp index 0dae2eeca5..a31004b425 100644 --- a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp +++ b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp @@ -26,7 +26,8 @@ struct TileGemmTraits using BLayout = BLayout_; using CLayout = CLayout_; - static constexpr bool TransposeC = false; + static constexpr bool TransposeC = false; + static constexpr bool UseStructuredSparsity = false; }; template >>; +using WarpGemmMfma_f32_16x16x64_fp8_fp8 = WarpGemmImpl, + 2>>; + +using WarpGemmMfma_f32_16x16x32_fp8_fp8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; + 
+using WarpGemmMfma_f32_16x16x64_bf8_bf8 = WarpGemmImpl, + 2>>; + +using WarpGemmMfma_f32_16x16x32_bf8_bf8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; + using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed = WarpGemmImpl>>; diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp index 21a865e792..64c7543ffe 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp @@ -623,6 +623,165 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M64N4K4 }; // FP8 +template +struct WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base +{ + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = AType_; + using BDataType = BType_; + using CDataType = float; + + using AVecType = ext_vector_t; + using BVecType = ext_vector_t; + using CVecType = ext_vector_t; + + static constexpr index_t kM = 16; + static constexpr index_t kN = 16; + static constexpr index_t kK = 32; + + static constexpr index_t kAMBlock = 1; + static constexpr index_t kBNBlock = 1; + + static constexpr index_t kAMLane = 16; + static constexpr index_t kBNLane = 16; + static constexpr index_t kABKLane = 4; + static constexpr index_t kABKPerLane = 8; + + static constexpr index_t kCMLane = 4; + static constexpr index_t kCNLane = 16; + static constexpr index_t kCM0PerLane = 1; + static constexpr index_t kCM1PerLane = 4; + + // c_vec += a_vec * b_vec + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const + { + if constexpr(Ctrl == WGAttrCtlEnum::Raw_vvv) + { + if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_16x16x32_fp8_fp8", "+v", "v", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_16x16x32_fp8_bf8", "+v", "v", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + 
DISPATCH_MFMA_("mfma_f32_16x16x32_bf8_fp8", "+v", "v", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_16x16x32_bf8_bf8", "+v", "v", "v", "v") + } + } + else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vaa) + { + if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_16x16x32_fp8_fp8", "+v", "a", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_16x16x32_fp8_bf8", "+v", "a", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_16x16x32_bf8_fp8", "+v", "a", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_16x16x32_bf8_bf8", "+v", "a", "a", "v") + } + } + else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vav) + { + if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_16x16x32_fp8_fp8", "+v", "a", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_16x16x32_fp8_bf8", "+v", "a", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_16x16x32_bf8_fp8", "+v", "a", "v", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_16x16x32_bf8_bf8", "+v", "a", "v", "v") + } + } + else if constexpr(Ctrl == WGAttrCtlEnum::Raw_vva) + { + if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_16x16x32_fp8_fp8", "+v", "v", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_16x16x32_fp8_bf8", "+v", "v", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_16x16x32_bf8_fp8", "+v", "v", "a", "v") + } + else if constexpr(std::is_same_v && std::is_same_v) + { + DISPATCH_MFMA_("mfma_f32_16x16x32_bf8_bf8", "+v", "v", "a", "v") + } + } + else + { +#if defined(__gfx94__) + if constexpr(std::is_same_v && std::is_same_v) 
+ c_vec = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8( + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + else if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8( + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + else if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8( + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); + else if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8( + bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); +#else + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; +#endif + } + } + + // c_vec = a_vec * b_vec + CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const + { +#if defined(__gfx94__) + if constexpr(std::is_same_v && std::is_same_v) + return bit_cast(__builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8( + bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); + else if constexpr(std::is_same_v && std::is_same_v) + return bit_cast(__builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8( + bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); + else if constexpr(std::is_same_v && std::is_same_v) + return bit_cast(__builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8( + bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); + else if constexpr(std::is_same_v && std::is_same_v) + return bit_cast(__builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8( + bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); +#else + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; + return CVecType{0.f}; +#endif + } +}; + template struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base { @@ -809,11 +968,17 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base template using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8 = WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; - +template +using WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8 = + 
WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base; template using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8 = WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; +template +using WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8 = + WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base; + template using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8 = WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp index 6320b33598..f437ee10c5 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp @@ -57,12 +57,16 @@ template<> struct WarpGemmMfmaDispatcher struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_16x16x32_fp8_fp8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_16x16x64_fp8_fp8; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_16x16x32_bf8_bf8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_16x16x64_bf8_bf8; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; }; // clang-format on From b092c18da708422fb529193de40b6224446007c5 Mon Sep 17 00:00:00 2001 From: 
Muhammed Emin Ozturk Date: Mon, 21 Apr 2025 11:44:07 -0700 Subject: [PATCH 057/443] MI308 fix for streamk 1-Tile floating point exception (#2101) --- .../gpu/grid/block_to_ctile_map.hpp | 67 ++++++++++++++++--- ...t_gemm_universal_streamk_ut_cases_bf16.inc | 28 -------- 2 files changed, 56 insertions(+), 39 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp index 64fad1ca48..311545aad6 100644 --- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp +++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp @@ -1438,6 +1438,7 @@ struct BlockToCTileMap_GemmStreamK_v2 __host__ __device__ BlockToCTileMap_GemmStreamK_v2( uint32_t m, uint32_t n, uint32_t k, uint32_t grid_size = 1, uint32_t streamk_sel = 1) { + // total output tiles uint32_t num_tiles = math::integer_divide_ceil(m, MPerBlock) * math::integer_divide_ceil(n, NPerBlock); @@ -1445,6 +1446,9 @@ struct BlockToCTileMap_GemmStreamK_v2 uint32_t dp_tiles, dp_num_blocks, sk_total_iters; + // Ensure grid_size is at least 1 to avoid division by zero + grid_size = math::max(grid_size, 1u); + // default to regular DP GEMM if sk blocks == 0 if(streamk_sel == 0) { @@ -1460,31 +1464,45 @@ struct BlockToCTileMap_GemmStreamK_v2 // 2-tile sk + DP GEMM else { - // check if there's enough work for DP+ stream-k bool bigEnough = num_tiles > grid_size; - // select between stream-k strategies + + // Select between stream-k strategies + // Add safety checks to prevent zero or negative values uint32_t sk_tiles = 0; if(streamk_sel == 1) // 1 tile stream-k { sk_tiles = bigEnough ? (num_tiles % grid_size) : num_tiles; + + // Ensure sk_tiles is at least 1 + sk_tiles = math::max(sk_tiles, 1u); } else if(streamk_sel == 2) // 2-tile stream-k { sk_tiles = bigEnough ? 
(grid_size + num_tiles % grid_size) : num_tiles; + + // Ensure sk_tiles is at least 1 but not more than num_tiles + sk_tiles = math::min(math::max(sk_tiles, 1u), num_tiles); } else if(streamk_sel == 3) // 3-tile stream-k { sk_tiles = (num_tiles > (2 * grid_size)) ? (2 * grid_size + num_tiles % grid_size) : num_tiles; + + // Ensure sk_tiles is at least 1 but not more than num_tiles + sk_tiles = math::min(math::max(sk_tiles, 1u), num_tiles); } else if(streamk_sel == 4) // 4-tile stream-k { sk_tiles = (num_tiles > (3 * grid_size)) ? (3 * grid_size + num_tiles % grid_size) : num_tiles; + + // Ensure sk_tiles is at least 1 but not more than num_tiles + sk_tiles = math::min(math::max(sk_tiles, 1u), num_tiles); } + sk_num_blocks = sk_tiles; - // remaining tiles are DP tiles + // Remaining tiles are DP tiles dp_tiles = bigEnough ? (num_tiles - sk_tiles) : 0; sk_total_iters = k_iters_per_tile.get() * sk_tiles; @@ -1500,24 +1518,51 @@ struct BlockToCTileMap_GemmStreamK_v2 // => sk_blocks * m + b = sk_total_iters // => b = sk_total_iters - m * sk_blocks // NOTE: big could be zero - uint32_t k_iters_per_sk_block = sk_total_iters / sk_num_blocks; - sk_num_big_blocks = sk_total_iters - k_iters_per_sk_block * sk_num_blocks; - k_iters_per_big_block = k_iters_per_sk_block + 1; + + // Add safety check for sk_num_blocks to prevent division by zero + if(sk_num_blocks > 0) + { + uint32_t k_iters_per_sk_block = sk_total_iters / sk_num_blocks; + sk_num_big_blocks = sk_total_iters - k_iters_per_sk_block * sk_num_blocks; + k_iters_per_big_block = k_iters_per_sk_block + 1; + } + else + { + // Fallback to default GEMM if no stream-k blocks + sk_num_blocks = 0; + sk_num_big_blocks = 0; + k_iters_per_big_block = 0; + dp_tiles = num_tiles; + dp_num_blocks = num_tiles; + dp_start_block_idx = 0; + sk_total_iters = 0; + } dp_num_blocks = dp_tiles; dp_start_block_idx = sk_num_blocks; } n_tiles = MDiv2(math::integer_divide_ceil(n, NPerBlock)); - // using multiple blocks for parallel reduction + // 
Using multiple blocks for parallel reduction reduction_start_block_idx = dp_start_block_idx + dp_num_blocks; if constexpr(ReductionStrategy == StreamKReductionStrategy::Reduction) { - uint32_t upper_big = math::lcm(k_iters_per_big_block, k_iters_per_tile.get()); - uint32_t upper_little = math::lcm(k_iters_per_big_block - 1, k_iters_per_tile.get()); - equiv_tiles_big = MDiv(upper_big / k_iters_per_tile.get()); - equiv_tiles_little = MDiv(upper_little / k_iters_per_tile.get()); + // Add additional safety checks + if(k_iters_per_big_block > 0 && k_iters_per_tile.get() > 0) + { + uint32_t upper_big = math::lcm(k_iters_per_big_block, k_iters_per_tile.get()); + uint32_t upper_little = + math::lcm(math::max(k_iters_per_big_block - 1, 1u), k_iters_per_tile.get()); + equiv_tiles_big = MDiv(upper_big / k_iters_per_tile.get()); + equiv_tiles_little = MDiv(upper_little / k_iters_per_tile.get()); + } + else + { + // Default safe values + equiv_tiles_big = MDiv(1); + equiv_tiles_little = MDiv(1); + } } } diff --git a/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_bf16.inc b/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_bf16.inc index b6970c4233..22977866b5 100644 --- a/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_bf16.inc +++ b/test/gemm_universal_streamk/test_gemm_universal_streamk_ut_cases_bf16.inc @@ -44,34 +44,6 @@ TYPED_TEST(TestGemmUniversal_Streamk_BF16_KM_KN, SmallM) } } -TYPED_TEST(TestGemmUniversal_Streamk_BF16_MK_KN, MidLargeM) -{ - std::vector Ms{127, 255, 312, 799, 1573}; - constexpr int N = 512; - constexpr int K = 320; - - constexpr int StrideA = K; - constexpr int StrideB = N; - constexpr int StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - -TYPED_TEST(TestGemmUniversal_Streamk_BF16_MK_NK, MidLargeM) -{ - std::vector Ms{127, 255, 312, 799, 1573}; - constexpr int N = 512; - constexpr int K = 320; - - constexpr int StrideA = K; - constexpr int StrideB = K; - constexpr int 
StrideC = N; - - for(int M : Ms) - this->Run(M, N, K, StrideA, StrideB, StrideC); -} - TYPED_TEST(TestGemmUniversal_Streamk_BF16_KM_KN, MidLargeM) { std::vector Ms{127, 255, 312, 799, 1573}; From 4bef60aa57c35575708a4af636f838e6cf26147d Mon Sep 17 00:00:00 2001 From: Thomas Ning Date: Mon, 21 Apr 2025 13:53:03 -0700 Subject: [PATCH 058/443] update code owner (#2113) --- .github/CODEOWNERS | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 15903314f9..eb69bd7f39 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,8 +1,8 @@ -* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent +* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent @ThomasNing # Documentation files -docs/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz -*.md @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz -*.rst @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz -.readthedocs.yaml @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz +docs/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing +*.md @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing +*.rst @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing +.readthedocs.yaml @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz 
@ThomasNing # Header directory for Doxygen documentation -library/include/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz +library/include/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing From 0cca8fa28ff31ee7403a667deffc954bd467041f Mon Sep 17 00:00:00 2001 From: Thomas Ning Date: Tue, 22 Apr 2025 01:13:22 -0700 Subject: [PATCH 059/443] GEMM Multiply Multiply Fix (#2102) * fix the type convert and increase the BF16 conversion + the profile comment * fix the CI --- include/ck/utility/type_convert.hpp | 2 +- profiler/src/profile_gemm_multiply_multiply.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/utility/type_convert.hpp b/include/ck/utility/type_convert.hpp index c8127aa887..04ae046ac8 100644 --- a/include/ck/utility/type_convert.hpp +++ b/include/ck/utility/type_convert.hpp @@ -117,7 +117,7 @@ inline __host__ __device__ constexpr bhalf_t type_convert(float #if CK_USE_RNE_BF16_CONVERSION return bf16_convert_rtn(x); #else - return uint16_t(uint32_t{x} >> 16); + return uint16_t(static_cast(x) >> 16); #endif } diff --git a/profiler/src/profile_gemm_multiply_multiply.cpp b/profiler/src/profile_gemm_multiply_multiply.cpp index ad2bb77544..42192b5985 100644 --- a/profiler/src/profile_gemm_multiply_multiply.cpp +++ b/profiler/src/profile_gemm_multiply_multiply.cpp @@ -42,7 +42,7 @@ int profile_gemm_multiply_multiply(int argc, char* argv[]) printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: " "f16->f8; 7: f8->bf16, " - "comp f8; 8: int8->bf16; 9: f8->f16, comp f8;)\n"); + "comp f8; 8: int8->bf16; 9: int8->f16, 10. 
f8->f16;)\n"); printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"); printf(" 1: A[m, k] * B[n, k] = C[m, n];\n"); printf(" 2: A[k, m] * B[k, n] = C[m, n];\n"); From 416e851584f5ec7d8b9cfc6ea73b829900b73750 Mon Sep 17 00:00:00 2001 From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com> Date: Tue, 22 Apr 2025 16:08:48 -0500 Subject: [PATCH 060/443] Temporarily disable MX FP4 device tests (#2112) --- include/ck/ck.hpp | 3 +++ test/data_type/test_mx_fp4.cpp | 2 ++ 2 files changed, 5 insertions(+) diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 0c2dc799ab..83b76382bc 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -244,6 +244,9 @@ // workaround: compiler issue on gfx950 #define CK_WORKAROUND_FP32_TO_FP4_SR_CONVERSION 1 +// workaround: compiler issue on gfx950 +#define CK_TEMP_DISABLE_FP4_TESTS 1 + // workaround: compiler issue on gfx950 #define CK_WORKAROUND_FP16_TO_FP8_CONVERSION 1 diff --git a/test/data_type/test_mx_fp4.cpp b/test/data_type/test_mx_fp4.cpp index 449f6fc777..7aca42567c 100644 --- a/test/data_type/test_mx_fp4.cpp +++ b/test/data_type/test_mx_fp4.cpp @@ -240,6 +240,7 @@ TEST(MXFP4, HostScaledConvert) EXPECT_EQ(test_size, i); } +#if !CK_TEMP_DISABLE_FP4_TESTS __global__ void test_mx_fp4_device_scaled_convert(uint64_t N, float* p_test, uint64_t* p_completed) { test_mx_fp4_scaled_convert(N, p_test, p_completed); @@ -539,3 +540,4 @@ TEST(MXFP4, DeviceF4x32ToF32x32ScaledConvert) EXPECT_EQ(N, completed); EXPECT_EQ(N, i); } +#endif // CK_TEMP_DISABLE_FP4_TESTS From 504f563f78fbf1a78d1d68fc94cdd69dfea2fb60 Mon Sep 17 00:00:00 2001 From: Gino Lu Date: Wed, 23 Apr 2025 06:52:36 +0800 Subject: [PATCH 061/443] [CK-Tile] warp-gemm support for using V_MFMA_F32_16x16x32_BF16 (#2073) * draft v_mfma_f32_16x16x32_bf16 * fix error config and add debug code. * Solve the CShuffle Problem * draft v_mfma_f32_16x16x32_bf16 * fix error config and add debug code. 
* Solve the CShuffle Problem * fix error while testing new command * Finished the feature of new mfma 16*16*32 * Addressed the comment --------- Co-authored-by: ThomasNing --- example/ck_tile/03_gemm/gemm_basic.cpp | 0 example/ck_tile/03_gemm/gemm_utils.hpp | 12 +- example/ck_tile/03_gemm/run_gemm_example.inc | 1 - include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 23 +++- .../warp/warp_gemm_attribute_mfma_impl.hpp | 126 ++++++++++++++++++ 5 files changed, 154 insertions(+), 8 deletions(-) mode change 100755 => 100644 example/ck_tile/03_gemm/gemm_basic.cpp diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp old mode 100755 new mode 100644 diff --git a/example/ck_tile/03_gemm/gemm_utils.hpp b/example/ck_tile/03_gemm/gemm_utils.hpp index 973006196b..25fab6bde0 100644 --- a/example/ck_tile/03_gemm/gemm_utils.hpp +++ b/example/ck_tile/03_gemm/gemm_utils.hpp @@ -55,17 +55,17 @@ struct GemmConfig #endif #if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3) // Compute friendly for Intrawave scheduler - static constexpr ck_tile::index_t M_Tile = 256; - static constexpr ck_tile::index_t N_Tile = 256; - static constexpr ck_tile::index_t K_Tile = 64; + static constexpr ck_tile::index_t M_Tile = 128; + static constexpr ck_tile::index_t N_Tile = 128; + static constexpr ck_tile::index_t K_Tile = 128; static constexpr ck_tile::index_t M_Warp = 2; static constexpr ck_tile::index_t N_Warp = 2; static constexpr ck_tile::index_t K_Warp = 1; - static constexpr ck_tile::index_t M_Warp_Tile = 32; - static constexpr ck_tile::index_t N_Warp_Tile = 32; - static constexpr ck_tile::index_t K_Warp_Tile = 16; + static constexpr ck_tile::index_t M_Warp_Tile = 16; + static constexpr ck_tile::index_t N_Warp_Tile = 16; + static constexpr ck_tile::index_t K_Warp_Tile = 32; static constexpr bool DoubleSmemBuffer = false; #elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4) diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc 
b/example/ck_tile/03_gemm/run_gemm_example.inc index b4ea5d22c0..79ed9ce76b 100644 --- a/example/ck_tile/03_gemm/run_gemm_example.inc +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -402,7 +402,6 @@ int run_gemm_example_with_layouts(int argc, "Error: Incorrect results!", rtol_atol.at(ck_tile::number<0>{}), rtol_atol.at(ck_tile::number<1>{})); - std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) << std::endl; diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp index 2c29814b73..bd7a0566a2 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp @@ -24,9 +24,14 @@ using WarpGemmMfmaF16F16F32M32N32K16 = WarpGemmImpl, 2>>; +#if defined(__gfx950__) +using WarpGemmMfmaF16F16F32M16N16K32 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; +#else using WarpGemmMfmaF16F16F32M16N16K32 = WarpGemmImpl, 2>>; +#endif using WarpGemmMfmaF16F16F32M32N32K8SwizzleA = WarpGemmImpl, @@ -49,10 +54,16 @@ using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution = WarpGemmAttributeMfmaImplF16F16F32M32N32K8, 2>>; +#if defined(__gfx950__) +using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution = + WarpGemmImpl>>; +#else using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution = WarpGemmImpl, 2>>; +#endif using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution = WarpGemmImpl>>; // bf16 - using WarpGemmMfmaBf16Bf16F32M32N32K8 = WarpGemmImpl< WarpGemmAtrributeMfma>>; @@ -87,9 +97,14 @@ using WarpGemmMfmaBf16Bf16F32M32N32K16 = WarpGemmImpl, 2>>; +#if defined(__gfx950__) +using WarpGemmMfmaBf16Bf16F32M16N16K32 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; +#else using WarpGemmMfmaBf16Bf16F32M16N16K32 = WarpGemmImpl, 2>>; +#endif using WarpGemmMfmaBf16Bf16F32M32N32K8SwizzleA = WarpGemmImpl, @@ -113,10 +128,16 @@ using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution = 
WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8, 2>>; +#if defined(__gfx950__) +using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution = + WarpGemmImpl>>; +#else using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution = WarpGemmImpl, 2>>; +#endif using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution = WarpGemmImpl +struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K32 +{ + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = bf16_t; + using BDataType = bf16_t; + using CDataType = float; + + using AVecType = ext_vector_t; + using BVecType = ext_vector_t; + using CVecType = ext_vector_t; + + static constexpr index_t kM = 16; + static constexpr index_t kN = 16; + static constexpr index_t kK = 32; + + static constexpr index_t kAMBlock = 1; + static constexpr index_t kBNBlock = 1; + + static constexpr index_t kAMLane = 16; + static constexpr index_t kBNLane = 16; + static constexpr index_t kABKLane = 4; + static constexpr index_t kABKPerLane = 8; + + static constexpr index_t kCMLane = 4; + static constexpr index_t kCNLane = 16; + static constexpr index_t kCM0PerLane = 1; + static constexpr index_t kCM1PerLane = 4; + + // c_vec += a_vec * b_vec + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const + { + DISPATCH_MFMA_CTRL_("v_mfma_f32_16x16x32_bf16", Ctrl) + else + { +#if defined(__gfx950__) + c_vec = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a_vec, b_vec, c_vec, 0, 0, 0); +#else + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; +#endif + } + } + + // c_vec = a_vec * b_vec + CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const + { +#if defined(__gfx950__) + return bit_cast( + __builtin_amdgcn_mfma_f32_16x16x32_bf16(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0)); +#else + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; + return CVecType{0.f}; +#endif + } +}; // FP16 template struct 
WarpGemmAttributeMfmaImplF16F16F32M32N32K8 @@ -188,6 +251,69 @@ struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16 } }; +template +struct WarpGemmAttributeMfmaImplF16F16F32M16N16K32 +{ + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = fp16_t; + using BDataType = fp16_t; + using CDataType = float; + + using AVecType = ext_vector_t; + using BVecType = ext_vector_t; + using CVecType = ext_vector_t; + + static constexpr index_t kM = 16; + static constexpr index_t kN = 16; + static constexpr index_t kK = 32; + + static constexpr index_t kAMBlock = 1; + static constexpr index_t kBNBlock = 1; + + static constexpr index_t kAMLane = 16; + static constexpr index_t kBNLane = 16; + static constexpr index_t kABKLane = 4; + static constexpr index_t kABKPerLane = 8; + + static constexpr index_t kCMLane = 4; + static constexpr index_t kCNLane = 16; + static constexpr index_t kCM0PerLane = 1; + static constexpr index_t kCM1PerLane = 4; + + // c_vec += a_vec * b_vec + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const + { + DISPATCH_MFMA_CTRL_("v_mfma_f32_16x16x32f16", Ctrl) + else + { +#if defined(__gfx950__) + c_vec = __builtin_amdgcn_mfma_f32_16x16x32_f16(a_vec, b_vec, c_vec, 0, 0, 0); +#else + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; +#endif + } + } + + // c_vec = a_vec * b_vec + CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const + { +#if defined(__gfx950__) + return bit_cast( + __builtin_amdgcn_mfma_f32_16x16x32_f16(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0)); +#else + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; + return CVecType{0.f}; +#endif + } +}; + template struct WarpGemmAttributeMfmaImplF16F16F32M4N64K4 { From 94662b02d0456bd29c7d3c36eeff39a0f7f49eed Mon Sep 17 00:00:00 2001 From: Khushbu Agarwal Date: Tue, 22 Apr 2025 15:55:19 -0700 Subject: [PATCH 062/443] Adding include directory in 
tile_engine (#2116) --- tile_engine/include/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) create mode 100755 tile_engine/include/CMakeLists.txt diff --git a/tile_engine/include/CMakeLists.txt b/tile_engine/include/CMakeLists.txt new file mode 100755 index 0000000000..d11a4b3bee --- /dev/null +++ b/tile_engine/include/CMakeLists.txt @@ -0,0 +1 @@ +message("Add include directory") From 39ba03f25d4c4c4e9f551a2dcf001cadd0b86cbe Mon Sep 17 00:00:00 2001 From: lalala-sh Date: Wed, 23 Apr 2025 10:35:34 +0800 Subject: [PATCH 063/443] Moe gemm activation (#2026) * fix useless code and remove usless oob * clang format * fix coredump in e2e test * fix2 * fix clang format * fix output oob * impl int64 but result not correct * int64 index ok now * input output all ok * fix uint32 * revert v1 test * use uint32 * mork to support 13w tokens * moe sorting fix moebuf * fix merge * update moe api fix aiter build * fix buid * fuse silu * silu ok * acale ok * add silu * change code * gemm2 ok * gufusion compatible ok, fix warnings * gu fusion for m32 m64 ok * support bf16 cshuffle * i4 gemm2 ok * i4 gemm2 ok and i4 gemm1 build * 16x16 run ok * change flops; change cshuffle dtype * fuse gelu silu act in moe gemm1 * fp8 with act ready * int4 act ready * remove useless changes * remove useless code change * fix clang format * add the arch limit of int4 moe gemm * fuse moe activation * fix fp8 16x16 * fix no quant case * fix bugs * fix fp8 gufusion bug * remove useless comments * refine activation code & complete moe example * fix int8 bugs * merge tkw1 --------- Co-authored-by: coderfeli Co-authored-by: feli Co-authored-by: illsilin Co-authored-by: root Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> --- .../65_gemm_multiply_multiply/CMakeLists.txt | 6 + .../moe_gemm1_xdl_fp8.cpp | 163 +++-- .../moe_gemm1_xdl_pk_i4.cpp | 166 ++--- .../moe_gemm2_xdl_fp8.cpp | 82 +-- .../moe_gemm2_xdl_pk_i4.cpp | 19 +- ...dlops_b_preshuffle_gufusion_dequant_v1.hpp | 621 
++++++++++++++++++ ...peline_xdlops_b_preshuffle_gufusion_v1.hpp | 573 ++++++++++++++++ ..._pipeline_xdlops_b_preshuffle_selector.hpp | 141 ++-- .../blockwise_gemm_pipeline_xdlops_base.hpp | 5 +- ...roup_tensor_slice_transfer_v4r1_gather.hpp | 4 +- ...oup_tensor_slice_transfer_v7r3_scatter.hpp | 14 +- .../gpu/device/impl/device_moe_gemm.hpp | 26 +- .../gpu/grid/gridwise_moe_gemm.hpp | 444 +++++++++---- ...wise_tensor_slice_transfer_v3r1_gather.hpp | 7 +- ...ise_tensor_slice_transfer_v7r3_scatter.hpp | 46 +- include/ck/utility/dynamic_buffer.hpp | 60 +- include/ck/utility/tuple_helper.hpp | 7 + .../cpu/reference_moe_gemm.hpp | 85 ++- .../cpu/reference_moe_gemm2.hpp | 2 +- 19 files changed, 1975 insertions(+), 496 deletions(-) create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v1.hpp diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt index deca85ae64..3c1947c058 100644 --- a/example/65_gemm_multiply_multiply/CMakeLists.txt +++ b/example/65_gemm_multiply_multiply/CMakeLists.txt @@ -13,6 +13,12 @@ foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) add_example_executable(example_moe_gemm1_xdl_pk_i4 moe_gemm1_xdl_pk_i4.cpp) add_example_executable(example_moe_gemm2_xdl_pk_i4 moe_gemm2_xdl_pk_i4.cpp) + if(CK_hip_VERSION VERSION_LESS_EQUAL 6.3.42132) + set(EXAMPLE_COMPILE_OPTIONS) + list(APPEND EXAMPLE_COMPILE_OPTIONS -mllvm --amdgpu-enable-max-ilp-scheduling-strategy=1) + target_compile_options(example_moe_gemm1_xdl_pk_i4 PRIVATE ${EXAMPLE_COMPILE_OPTIONS}) + target_compile_options(example_moe_gemm2_xdl_pk_i4 PRIVATE ${EXAMPLE_COMPILE_OPTIONS}) + endif() set(target 1) endif() endforeach() diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp 
b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp index f594080755..3b31460953 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp @@ -25,7 +25,6 @@ template using S = ck::Sequence; using F16 = ck::half_t; -// using BF16 = ck::bhalf_t; using F8 = ck::f8_t; using F32 = float; @@ -36,7 +35,7 @@ using A0DataType = F8; using B0DataType = F8; using EDataType = F16; using AccDataType = F32; -using CShuffleDataType = F32; +using CShuffleDataType = EDataType; using D0DataType = F32; using D1DataType = F32; using D2DataType = F32; @@ -61,27 +60,25 @@ struct MulABScale __host__ __device__ constexpr void operator()( EDataType& e, const float& c, const float& d0, const float& d1) const { - e = ck::type_convert(c * d1 * d0); + (void)d0; + (void)d1; + e = ck::type_convert(c); } -}; - -// for gate, a_scale, b_scale, fuse silu, -struct MulABScaleSilu -{ - template - __host__ __device__ constexpr void - operator()(E& e, const C& c, const D0& d0, const D1& d1) const; - template <> - __host__ __device__ constexpr void operator()(EDataType& e, - const float& c, - const float& d0, - const float& d1) const + __host__ __device__ constexpr void operator()( + EDataType& e, const EDataType& c, const float& d0, const float& d1) const { - // act - float x0 = 0; - ck::tensor_operation::element_wise::Silu{}(x0, c * d1 * d0); - e = ck::type_convert(x0); + (void)d0; + (void)d1; + e = ck::type_convert(c); + } + template <> + __host__ __device__ constexpr void operator()( + EDataType& e, const EDataType& c, const EDataType& d0, const EDataType& d1) const + { + (void)d0; + (void)d1; + e = ck::type_convert(c); } }; @@ -95,11 +92,19 @@ struct MulABScaleExpertWeight __host__ __device__ constexpr void operator()( EDataType& e, const float& c, const float& d0, const float& d1, const float& d2) const { - // for real kernel use - // warning: hack hack hack here!!!! ignore d0 right now as kernel mul d0 * d2 outside. 
- // tofix:felix + (void)d0; + (void)d1; (void)d2; - e = ck::type_convert(c * d1 * d0); + e = ck::type_convert(c); + } + template <> + __host__ __device__ constexpr void operator()( + EDataType& e, const EDataType& c, const float& d0, const float& d1, const float& d2) const + { + (void)d0; + (void)d1; + (void)d2; + e = ck::type_convert(c); } // for reference cpu template <> @@ -107,16 +112,14 @@ struct MulABScaleExpertWeight float& e, const float& c, const float& d0, const float& d1, const float& d2) const { // for reference cpu - e = ck::type_convert(c * d0 * d1 * d2); + (void)d0; + (void)d1; + (void)d2; + e = ck::type_convert(c); } }; -using CDEElementOp = MulABScaleExpertWeight; // combine MulRoutedWeight = true -// using DsLayout = DsLayoutGate; -// using DsDataType = DsDataTypeGate; -// using CDEElementOp = MulABScale; // combine MulRoutedWeight = false - -// using CDEElementOp = MulABScaleSiluMulGate; +using CDEElementOp = MulABScaleExpertWeight; void preShuffleBuffer(const B0DataType* src, B0DataType* dst, int N, int K, int NXdl) { @@ -155,22 +158,21 @@ using BElementOp = PassThrough; static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; static constexpr ck::index_t MPerBlock = 128; -static constexpr ck::index_t MXDLPerWave = 2; +static constexpr ck::index_t MXDLPerWave = 4; static constexpr ck::index_t NXDLPerWave = 2; static constexpr ck::index_t BLOCKSIZE = 256; -static constexpr ck::index_t NPerBlock = 128; -static constexpr ck::index_t MNPerXDL = 32; +static constexpr ck::index_t NPerBlock = 64; +static constexpr ck::index_t MNPerXDL = 16; static constexpr ck::index_t KPerBlock = 128 / sizeof(A0DataType); -static constexpr ck::index_t Nswizzle = true; -static constexpr bool MulRoutedWeight = false; +static constexpr ck::index_t Nswizzle = false; static constexpr ck::index_t AK1 = 16 / sizeof(A0DataType); static constexpr ck::index_t BK1 = 16 / sizeof(B0DataType); static constexpr ck::index_t EVec = 16 / 
sizeof(EDataType); static constexpr ck::index_t D0Vec = 1; static constexpr ck::index_t D1Vec = 1; -static constexpr ck::index_t D2Vec = 1; -// using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3 -using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm +static constexpr ck::index_t ActOP = 1; // 0: gelu_and_mul, 1: silu_and_mul +static constexpr bool MulRoutedWeight = false; +using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm // clang-format off < Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, @@ -188,8 +190,8 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm // CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| // MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| // PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - 2, 1, S<1, 32, 1, 8>, S, - ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, Nswizzle, true, MulRoutedWeight, A0DataType>; + 2, 2, S<1, 32, 1, 8>, S, + ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, ActOP, Nswizzle, true, MulRoutedWeight, true, int32_t, A0DataType>; // clang-format on @@ -201,15 +203,13 @@ int main(int argc, char* argv[]) // GEMM shape ck::index_t N = 4096; - ck::index_t K = 4096; + ck::index_t K = 6144; ck::index_t experts = 8; - ck::index_t sorted_tile_num = 8; - ck::index_t valid_tile_num = 8; - ck::index_t tokens = 128; + ck::index_t sorted_tile_num = 16; + ck::index_t valid_tile_num = 13; + ck::index_t tokens = 64; ck::index_t topk = 2; - // ck::index_t tokens = batch * topk; - if(argc == 1) { // use default case @@ -255,28 +255,22 @@ int main(int argc, char* argv[]) ck::index_t StrideB = K; ck::index_t StrideE = N; constexpr ck::index_t NumDTensor = DsDataType::Size(); - constexpr auto StrideDs = std::array{0, 0, 0}; + constexpr auto 
StrideDs = std::array{1, 1, 1}; ck::index_t KBatch = 1; - // const ck::index_t experts = 8; Tensor expert_ids(HostTensorDescriptor({sorted_tile_num}, {1})); Tensor sorted_token_ids(HostTensorDescriptor({sorted_size}, {1})); Tensor max_token_id(HostTensorDescriptor({1 + sorted_tile_num})); - // max_token_id.mData = {valid_size, 2, 2, 1, 1, 2, 2, 2,2, 2, 2, 2, 2,1,0,0,0}; - // max_token_id.mData = {valid_size, 0, 2, 3, 4, 6, 8, 10, 12, 13}; - // int eids[] = {0, 0,1, 2,3, 3, 4,4, 5, 5, 6, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2} - // max_token_id.mData = {valid_size, 0, 2, 3, 4, 6, 8, 10, 12, 13}; - // int eids[] = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2} - max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8}; - int eids[] = {0, 1, 2, 3, 4, 5, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2} + max_token_id.mData = {valid_size}; + int eids[] = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 3, 3, 3}; for(int i = 0; i < sorted_tile_num; i++) { expert_ids.mData[i] = eids[i]; } int token_per_tile = (tokens * topk + valid_tile_num - 1) / valid_tile_num; int tokenid = 0; - // sorted_token_ids.mData[0] = 0; + for(int i = 0; i < sorted_size; i++) { int tile_off = i % MPerBlock; @@ -290,13 +284,12 @@ int main(int argc, char* argv[]) sorted_token_ids.mData[i] = tokens; } } - // expert_ids.savetxt("expert_ids.txt", "int"); - // sorted_token_ids.savetxt("sorted_token_ids.txt", "int"); Tensor a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1})); - Tensor b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K})); - Tensor b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K})); + Tensor b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K})); + Tensor b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K})); Tensor d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0})); - Tensor d1_e_n(HostTensorDescriptor({experts, N}, {1, StrideDs[1]})); + Tensor d1_e_n( + HostTensorDescriptor({experts, N * 
2}, {StrideDs[1] * N * 2, StrideDs[1]})); Tensor d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0})); Tensor e_t_n_host_result(HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1})); Tensor e_t_n_device_result( @@ -304,6 +297,7 @@ int main(int argc, char* argv[]) std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl; std::cout << "b0_e_n_k: " << b0_e_n_k.mDesc << std::endl; std::cout << "d1_e_n: " << d1_e_n.mDesc << std::endl; + std::cout << "d2_e_n: " << d2_e_n.mDesc << std::endl; std::cout << "d0_t_n: " << d0_t_n.mDesc << std::endl; std::cout << "d2_e_n: " << d2_e_n.mDesc << std::endl; std::cout << "e_t_n: " << e_t_n_host_result.mDesc << std::endl; @@ -312,25 +306,25 @@ int main(int argc, char* argv[]) { case 0: break; case 1: - a0_t_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - b0_e_n_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - d0_t_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - d1_e_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - d2_e_n.GenerateTensorValue(GeneratorTensor_3{-2, 2}); + a0_t_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_e_n_k.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d0_t_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + d1_e_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + d2_e_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); break; case 2: - a0_t_k.GenerateTensorValue(GeneratorTensor_1{}); - b0_e_n_k.GenerateTensorValue(GeneratorTensor_1{}); - d0_t_n.GenerateTensorValue(GeneratorTensor_1{}); + a0_t_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_e_n_k.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d0_t_n.GenerateTensorValue(GeneratorTensor_3{0, 1}); d1_e_n.GenerateTensorValue(GeneratorTensor_1{}); d2_e_n.GenerateTensorValue(GeneratorTensor_3{}); break; case 3: - a0_t_k.GenerateTensorValue(GeneratorTensor_1{}); - b0_e_n_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + a0_t_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + 
b0_e_n_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); d0_t_n.GenerateTensorValue(GeneratorTensor_1{}); - d1_e_n.GenerateTensorValue(GeneratorTensor_1{}); - d2_e_n.GenerateTensorValue(GeneratorTensor_3{}); + d1_e_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + d2_e_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); break; default: a0_t_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); @@ -349,9 +343,7 @@ int main(int argc, char* argv[]) DeviceMem d1_device_buf(sizeof(D1DataType) * d1_e_n.mDesc.GetElementSpaceSize()); DeviceMem d2_device_buf(sizeof(D2DataType) * d2_e_n.mDesc.GetElementSpaceSize()); DeviceMem e_device_buf(sizeof(EDataType) * e_t_n_device_result.mDesc.GetElementSpaceSize()); - // a0_t_k.savetxt("a.txt"); - // d0_t_n.savetxt("d0_t_n.txt", "int"); - // d1_e_n.savetxt("d1_e_n.txt", "int"); + sorted_token_ids_dev.ToDevice(sorted_token_ids.mData.data()); expert_ids_dev.ToDevice(expert_ids.mData.data()); max_token_id_dev.ToDevice(max_token_id.mData.data()); @@ -369,7 +361,8 @@ int main(int argc, char* argv[]) int NPerXdl = device_op.GetPreShuffleParameters(); - preShuffleBuffer(b0_e_n_k.mData.data(), b0_preshuffled.mData.data(), N * experts, K, NPerXdl); + preShuffleBuffer( + b0_e_n_k.mData.data(), b0_preshuffled.mData.data(), N * 2 * experts, K, NPerXdl); b0_device_buf.ToDevice(b0_preshuffled.mData.data()); @@ -408,9 +401,9 @@ int main(int argc, char* argv[]) { float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); - std::size_t flop = std::size_t(2) * tokens * topk * N * K; + std::size_t flop = std::size_t(2) * tokens * topk * N * 2 * K; std::size_t num_btype = sizeof(A0DataType) * valid_tile_num * K + - sizeof(B0DataType) * K * N * experts + + sizeof(B0DataType) * K * N * 2 * experts + sizeof(EDataType) * valid_tile_num * N; float tflops = static_cast(flop) / 1.E9 / ave_time; @@ -437,6 +430,7 @@ int main(int argc, char* argv[]) PassThrough, PassThrough, PassThrough, + ActOP, MulRoutedWeight>; auto ref_moe_gemm = 
ReferenceGemmInstance{}; auto ref_invoker = ref_moe_gemm.MakeInvoker(); @@ -446,7 +440,9 @@ int main(int argc, char* argv[]) max_token_id, MPerBlock, a0_t_k, + d0_t_n, b0_e_n_k, + d1_e_n, c_t_k_n, d2_e_n, PassThrough{}, @@ -472,15 +468,14 @@ int main(int argc, char* argv[]) c_t_k_n(t, topk_id, n), d0_t_n(t, n), d1_e_n(e, n), - 1.f); + d2_e_n(e, n)); } } e_device_buf.FromDevice(e_t_n_device_result.mData.data()); - // e_t_n_device_result.savetxt("out.txt"); - // e_t_n_host_result.savetxt("ref.txt"); + return ck::utils::check_err( - e_t_n_device_result, e_t_n_host_result, "Error: Incorrect results!", 1e-3, 5e-2) + e_t_n_device_result, e_t_n_host_result, "Error: Incorrect results!", 1e-3, 5e-1) ? 0 : 1; } diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp index fb8a8b9826..3c3ef16198 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp @@ -36,7 +36,7 @@ using A0DataType = F8; using B0DataType = I4; using EDataType = F16; using AccDataType = F32; -using CShuffleDataType = F32; +using CShuffleDataType = F16; using D0DataType = F32; using D1DataType = F32; using D2DataType = F32; @@ -47,7 +47,8 @@ using B0Layout = Col; using ELayout = Row; using D0Layout = Row; using D1Layout = Col; -using DsLayout = ck::Tuple; +using D2Layout = ELayout; +using DsLayout = ck::Tuple; // for gate, a_scale, b_scale struct MulABScale @@ -56,42 +57,32 @@ struct MulABScale __host__ __device__ constexpr void operator()(E& e, const C& c, const D0& d0, const D1& d1) const; + template <> + __host__ __device__ constexpr void operator()( + EDataType& e, const EDataType& c, const float& d0, const float& d1) const + { + (void)d0; + (void)d1; +#if CK_USE_PK4_LAYOUT_SHUFFLE + e = ck::type_convert(c); +#else + e = ck::type_convert(c); +#endif + } template <> __host__ __device__ constexpr void operator()( EDataType& e, const float& c, const float& 
d0, const float& d1) const { + (void)d0; + (void)d1; #if CK_USE_PK4_LAYOUT_SHUFFLE - e = ck::type_convert(c * d1 * d0 * 16); + e = ck::type_convert(c); #else - e = ck::type_convert(c * d1 * d0); + e = ck::type_convert(c); #endif } }; -// for gate, a_scale, b_scale, fuse silu, -struct MulABScaleSilu -{ - template - __host__ __device__ constexpr void - operator()(E& e, const C& c, const D0& d0, const D1& d1) const; - - template <> - __host__ __device__ constexpr void operator()(EDataType& e, - const float& c, - const float& d0, - const float& d1) const - { - // act - float x0 = 0; -#if CK_USE_PK4_LAYOUT_SHUFFLE - ck::tensor_operation::element_wise::Silu{}(x0, c * d1 * d0 * 16); -#else - ck::tensor_operation::element_wise::Silu{}(x0, c * d1 * d0); -#endif - e = ck::type_convert(x0); - } -}; - struct MulABScaleExpertWeight { template @@ -102,13 +93,19 @@ struct MulABScaleExpertWeight __host__ __device__ constexpr void operator()( EDataType& e, const float& c, const float& d0, const float& d1, const float& d2) const { + (void)d0; + (void)d1; (void)d2; - -#if CK_USE_PK4_LAYOUT_SHUFFLE - e = ck::type_convert(c * d1 * d0 * 16); -#else - e = ck::type_convert(c * d1 * d0); -#endif + e = ck::type_convert(c); + } + template <> + __host__ __device__ constexpr void operator()( + EDataType& e, const EDataType& c, const float& d0, const float& d1, const float& d2) const + { + (void)d0; + (void)d1; + (void)d2; + e = ck::type_convert(c); } // for reference cpu template <> @@ -116,15 +113,18 @@ struct MulABScaleExpertWeight float& e, const float& c, const float& d0, const float& d1, const float& d2) const { // for reference cpu -#if CK_USE_PK4_LAYOUT_SHUFFLE - e = ck::type_convert(c * d0 * d1 * d2 * 16); -#else - e = ck::type_convert(c * d0 * d1 * d2); -#endif + (void)d0; + (void)d1; + (void)d2; + e = ck::type_convert(c); } }; -using CDEElementOp = MulABScaleExpertWeight; +static constexpr bool MulRoutedWeight = true; + +using CDEElementOp = MulABScaleExpertWeight; // combine 
MulRoutedWeight = true + +// using CDEElementOp = MulABScale; // combine MulRoutedWeight = true #if 1 void preShuffleBuffer(const I4* src, I4* dst, int N, int K, int NXdl) @@ -165,54 +165,24 @@ using AElementOp = PassThrough; using BElementOp = PassThrough; static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; -#if 0 -static constexpr ck::index_t MPerBlock = 64; -static constexpr ck::index_t MXDLPerWave = 1; -static constexpr ck::index_t NXDLPerWave = 2; -static constexpr ck::index_t BLOCKSIZE = 256; -static constexpr ck::index_t NPerBlock = 128; -static constexpr ck::index_t MNPerXDL = 32; -static constexpr ck::index_t KPerBlock = 64 / sizeof(A0DataType); -static constexpr ck::index_t Nswizzle = false; -static constexpr ck::index_t AK1 = 16 / sizeof(A0DataType); -static constexpr ck::index_t BK1 = 32 / sizeof(B0DataType); -static constexpr ck::index_t EVec = 16 / sizeof(EDataType); -static constexpr ck::index_t D0Vec = 1; -static constexpr ck::index_t D1Vec = 1; -// clang-format off -using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm< - Row, Col, DsLayout, ELayout, - A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, - AElementOp, BElementOp, CDEElementOp, GemmSpec, - BLOCKSIZE, MPerBlock, NPerBlock, KPerBlock, - AK1, BK1, - MNPerXDL, MNPerXDL, - MXDLPerWave, NXDLPerWave, - S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0, - S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, BK1, BK1, 0, - MXDLPerWave, 1, S<1, 32, 1, 8>, S, - ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, Nswizzle, true, A0DataType>; -// clang-format on -#else static constexpr ck::index_t MPerBlock = 128; -static constexpr ck::index_t Nswizzle = false; -static constexpr bool MulRoutedWeight = false; +static constexpr ck::index_t Nswizzle = false; +static constexpr ck::index_t Act_OP = 1; // 0: gelu_and_mul, 1: silu_and_mul // clang-format off using DeviceOpInstance = 
ck::tensor_operation::device::DeviceMoeGemm< Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, - 256, MPerBlock, 128, 128, + 256, MPerBlock, 64, 128, 16, 32, - 32, 32, - 4, 1, + 16, 16, + 8, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 32, 32, 0, - 1, 1, S<1, 32, 1, 8>, S<8, 1, 1, 1>, - ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, Nswizzle, true, MulRoutedWeight, A0DataType>; + 2, 1, S<1, 32, 1, 8>, S<8, 1, 1>, + ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, Act_OP, Nswizzle, true, MulRoutedWeight, true, ck::index_t, A0DataType>; // clang-format on -#endif int main(int argc, char* argv[]) { @@ -220,13 +190,10 @@ int main(int argc, char* argv[]) int init_method = 1; bool time_kernel = true; - // tokens = 1 - // topk = 1 - // experts = 8 // per expert: // GEMM shape - ck::index_t N = 4096 * 2; - ck::index_t K = 6144; + ck::index_t N = 14336; + ck::index_t K = 4096; ck::index_t experts = 8; ck::index_t sorted_tile_num = 16; ck::index_t valid_tile_num = 13; @@ -266,20 +233,20 @@ int main(int argc, char* argv[]) ck::index_t StrideB = K; ck::index_t StrideE = N; constexpr ck::index_t NumDTensor = DsDataType::Size(); - constexpr auto StrideDs = std::array{0, 0}; + constexpr auto StrideDs = std::array{0, 0, 0}; ck::index_t KBatch = 1; Tensor expert_ids(HostTensorDescriptor({sorted_tile_num}, {1})); Tensor sorted_token_ids(HostTensorDescriptor({sorted_size}, {1})); Tensor max_token_id(HostTensorDescriptor({1 + sorted_tile_num})); - max_token_id.mData = {valid_size, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 0, 0}; + max_token_id.mData = {valid_size}; int eids[] = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 7, 7, 3, 3, 3}; for(int i = 0; i < sorted_tile_num; i++) { expert_ids.mData[i] = eids[i]; } - int token_per_tile = tokens * topk / valid_tile_num; + int token_per_tile 
= (tokens * topk + valid_tile_num - 1) / valid_tile_num; int tokenid = 0; for(int i = 0; i < sorted_size; i++) { @@ -294,11 +261,12 @@ int main(int argc, char* argv[]) sorted_token_ids.mData[i] = tokens; } } + Tensor a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1})); - Tensor b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K})); - Tensor b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K})); + Tensor b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K})); + Tensor b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K})); Tensor d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0})); - Tensor d1_e_n(HostTensorDescriptor({experts, N}, {1, StrideDs[1]})); + Tensor d1_e_n(HostTensorDescriptor({experts, N * 2}, {1, StrideDs[1]})); Tensor d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0})); Tensor e_t_n_host_result(HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1})); Tensor e_t_n_device_result( @@ -306,6 +274,7 @@ int main(int argc, char* argv[]) std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl; std::cout << "b0_e_n_k: " << b0_e_n_k.mDesc << std::endl; + std::cout << "d2_e_n: " << d2_e_n.mDesc << std::endl; std::cout << "d1_e_n: " << d1_e_n.mDesc << std::endl; std::cout << "d0_t_n: " << d0_t_n.mDesc << std::endl; std::cout << "e_t_n: " << e_t_n_host_result.mDesc << std::endl; @@ -314,11 +283,11 @@ int main(int argc, char* argv[]) { case 0: break; case 1: - a0_t_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - b0_e_n_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - d0_t_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - d1_e_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - d2_e_n.GenerateTensorValue(GeneratorTensor_3{-2, 2}); + a0_t_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_e_n_k.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d0_t_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + d1_e_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + 
d2_e_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); break; case 2: a0_t_k.GenerateTensorValue(GeneratorTensor_1{}); @@ -497,9 +466,9 @@ int main(int argc, char* argv[]) { float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); - std::size_t flop = std::size_t(2) * tokens * topk * N * K; + std::size_t flop = std::size_t(2) * tokens * topk * N * 2 * K; std::size_t num_btype = sizeof(A0DataType) * valid_tile_num * K + - sizeof(B0DataType) / 2 * K * N * experts + + sizeof(B0DataType) / 2 * K * N * 2 * experts + sizeof(EDataType) * valid_tile_num * N; float tflops = static_cast(flop) / 1.E9 / ave_time; @@ -526,6 +495,7 @@ int main(int argc, char* argv[]) PassThrough, PassThrough, PassThrough, + Act_OP, MulRoutedWeight>; auto ref_moe_gemm = ReferenceGemmInstance{}; auto ref_invoker = ref_moe_gemm.MakeInvoker(); @@ -535,7 +505,9 @@ int main(int argc, char* argv[]) max_token_id, MPerBlock, a0_t_k, + d0_t_n, b0_e_n_k, + d1_e_n, c_t_k_n, d2_e_n, PassThrough{}, @@ -561,13 +533,13 @@ int main(int argc, char* argv[]) c_t_k_n(t, topk_id, n), d0_t_n(t, n), d1_e_n(e, n), - 1.f); + d2_e_n(e, n)); } } e_device_buf.FromDevice(e_t_n_device_result.mData.data()); return ck::utils::check_err( - e_t_n_device_result, e_t_n_host_result, "Error: Incorrect results!", 1e-3, 5e-2) + e_t_n_device_result, e_t_n_host_result, "Error: Incorrect results!", 1e-3, 5e-1) ? 
0 : 1; } diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp index 04f10b53ae..42d892fe26 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp @@ -25,7 +25,6 @@ template using S = ck::Sequence; using F16 = ck::half_t; -// using BF16 = ck::bhalf_t; using F8 = ck::f8_t; using F32 = float; @@ -36,7 +35,7 @@ using A0DataType = F8; using B0DataType = F8; using EDataType = F16; using AccDataType = F32; -using CShuffleDataType = F32; +using CShuffleDataType = F16; using D0DataType = F32; using D1DataType = F32; using D2DataType = F32; @@ -48,7 +47,6 @@ using ELayout = Row; using D0Layout = Row; using D1Layout = Col; using D2Layout = ELayout; -// using DsLayoutGate = ck::Tuple; using DsLayout = ck::Tuple; // d0: ascale, d1: bscale, d2:expert weight @@ -62,11 +60,19 @@ struct MulABScaleExpertWeight __host__ __device__ constexpr void operator()( EDataType& e, const float& c, const float& d0, const float& d1, const float& d2) const { - // for real kernel use - // warning: hack hack hack here!!!! ignore d0 right now as kernel mul d0 * d2 outside. 
- // tofix:felix (void)d0; - e = ck::type_convert(c * d1 * d2); + (void)d1; + (void)d2; + e = ck::type_convert(c); + } + template <> + __host__ __device__ constexpr void operator()( + EDataType& e, const EDataType& c, const float& d0, const float& d1, const float& d2) const + { + (void)d0; + (void)d1; + (void)d2; + e = ck::type_convert(c); } // for reference cpu template <> @@ -119,14 +125,12 @@ using CDEElementOp = MulABScaleExpertWeight; static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; static constexpr ck::index_t MPerBlock = 128; static constexpr ck::index_t BLOCKSIZE = 256; -static constexpr ck::index_t MXDLPerWave = 2; -static constexpr ck::index_t NXDLPerWave = 2; +static constexpr ck::index_t MXDLPerWave = 4; +static constexpr ck::index_t NXDLPerWave = 4; static constexpr ck::index_t NPerBlock = 128; -static constexpr ck::index_t MNPerXDL = 32; +static constexpr ck::index_t MNPerXDL = 16; static constexpr ck::index_t KPerBlock = 128 / sizeof(A0DataType); -// static constexpr ck::index_t MXDLPerWave = MPerBlock / 32; //todo fix this constraint -// static constexpr ck::index_t CShuffleMXDLPerWave = MPerBlock / 32; static constexpr ck::index_t CShuffleNLane = 32; static constexpr ck::index_t CShuffleMLane = BLOCKSIZE / CShuffleNLane; static constexpr ck::index_t AK1 = 16 / sizeof(A0DataType); @@ -135,7 +139,7 @@ static constexpr ck::index_t EVec = 2; static constexpr ck::index_t D0Vec = 1; static constexpr ck::index_t D1Vec = 1; static constexpr ck::index_t D2Vec = 1; -static constexpr bool MulRoutedWeight = false; +static constexpr bool MulRoutedWeight = true; using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm // clang-format off ///######| ALayout| BLayout| DsLayout| ELayout| AData| BData| DsData| EData| AccData| CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| 
ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -164,8 +168,8 @@ using DeviceOpInstance = ck::tensor_operation::device::Devic // CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| // MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| // PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - 2, 1, S<1, CShuffleMLane, 1, CShuffleNLane>, S, - ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, false, false, MulRoutedWeight, A0DataType>; + 4, 2, S<1, CShuffleMLane, 1, CShuffleNLane>, S, + ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, 0, false, false, MulRoutedWeight, false, int32_t, A0DataType>; // kernel 2: 128->32x128x128 // < Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 128, 32, 128, 128, 16, 16, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<8, 8, 1>, ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, EDataType>; @@ -177,16 +181,13 @@ int main(int argc, char* argv[]) int init_method = 1; bool time_kernel = true; - // tokens = 1 - // topk = 1 - // experts = 8 // per expert: // GEMM shape ck::index_t N = 4096; ck::index_t K = 4096; ck::index_t experts = 8; - ck::index_t sorted_tile_num = 6; - ck::index_t valid_tile_num = 6; + ck::index_t sorted_tile_num = 16; + ck::index_t valid_tile_num = 13; ck::index_t sorted_size = sorted_tile_num * MPerBlock; ck::index_t valid_size = valid_tile_num * MPerBlock; ck::index_t tokens = 128; @@ -212,6 +213,18 @@ int main(int argc, char* argv[]) K = std::stoi(argv[5]); tokens = std::stoi(argv[6]); } + else if(argc == 9) + { + + do_verification = std::stoi(argv[1]); + init_method = 
std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + N = std::stoi(argv[4]); + K = std::stoi(argv[5]); + tokens = std::stoi(argv[6]); + sorted_tile_num = std::stoi(argv[7]); + valid_tile_num = std::stoi(argv[8]); + } else { printf("arg1: verification (0=no, 1=yes)\n"); @@ -229,15 +242,13 @@ int main(int argc, char* argv[]) ck::index_t KBatch = 1; - // const ck::index_t experts = 8; Tensor expert_ids(HostTensorDescriptor({sorted_tile_num}, {1})); Tensor sorted_token_ids(HostTensorDescriptor({sorted_size}, {1})); Tensor max_token_id(HostTensorDescriptor({1})); - // max_token_id.mData[0] = valid_size; - // max_token_id.mData = {valid_size, 0, 2, 3, 4, 6, 8, 10, 12, 13}; - // int eids[] = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 7, 7, 3, 3, 3}; - max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8}; - int eids[] = {0, 1, 2, 3, 4, 5, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2} + + max_token_id.mData = {valid_size, 0, 2, 3, 4, 6, 8, 10, 12, 13}; + int eids[] = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 7, 7, 3, 3, 3}; + for(int i = 0; i < sorted_tile_num; i++) { expert_ids.mData[i] = eids[i]; @@ -249,7 +260,7 @@ int main(int argc, char* argv[]) } int token_per_tile = tokens * topk / valid_tile_num; int tokenid = 0; - // sorted_token_ids.mData[0] = 0; + for(int i = 0; i < sorted_size; i++) { int tile_off = i % MPerBlock; @@ -263,8 +274,7 @@ int main(int argc, char* argv[]) sorted_token_ids.mData[i] = tokens; } } - expert_ids.savetxt("expert_ids.txt", "int"); - sorted_token_ids.savetxt("sorted_token_ids.txt", "int"); + Tensor a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1})); Tensor b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K})); Tensor b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K})); @@ -315,12 +325,7 @@ int main(int argc, char* argv[]) DeviceMem d1_device_buf(sizeof(D1DataType) * d1_e_n.mDesc.GetElementSpaceSize()); DeviceMem d2_device_buf(sizeof(D2DataType) * d2_e_n.mDesc.GetElementSpaceSize()); DeviceMem 
e_device_buf(sizeof(EDataType) * e_t_n_device_result.mDesc.GetElementSpaceSize()); - // a0_t_k_k.savetxt("a.txt"); - // expert_ids.savetxt("expert_ids.txt", "int"); - // sorted_token_ids.savetxt("sorted_token_ids.txt", "int"); - // d0_t_n.savetxt("d0_t_n.txt", "int"); - // d1_e_n.savetxt("d1_e_n.txt", "int"); - // d2_e_n.savetxt("d2_e_n.txt", "int"); + sorted_token_ids_dev.ToDevice(sorted_token_ids.mData.data()); expert_ids_dev.ToDevice(expert_ids.mData.data()); max_token_id_dev.ToDevice(max_token_id.mData.data()); @@ -398,7 +403,7 @@ int main(int argc, char* argv[]) e_device_buf.ToDevice(e_t_n_device_result.mData.data()); invoker.Run(argument, StreamConfig{nullptr, false, 0, 0, 1}); - Tensor c_t_n({tokens, N}); + Tensor c_t_n({tokens, N}); using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceMoeGemm2(c * d1 * d2 * 16); + e = ck::type_convert(c * 16); #else - e = ck::type_convert(c * d1 * d2); + e = ck::type_convert(c); #endif } // for reference cpu @@ -125,10 +127,10 @@ using CDEElementOp = MulABScaleExpertWeight; static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; static constexpr ck::index_t MPerBlock = 128; static constexpr ck::index_t BLOCKSIZE = 256; -static constexpr ck::index_t MXDLPerWave = 4; -static constexpr ck::index_t NXDLPerWave = 1; +static constexpr ck::index_t MXDLPerWave = 8; +static constexpr ck::index_t NXDLPerWave = 2; static constexpr ck::index_t NPerBlock = 128; -static constexpr ck::index_t MNPerXDL = 32; +static constexpr ck::index_t MNPerXDL = 16; static constexpr ck::index_t KPerBlock = 128 / sizeof(A0DataType); static constexpr ck::index_t CShuffleNLane = 32; static constexpr ck::index_t CShuffleMLane = BLOCKSIZE / CShuffleNLane; @@ -149,8 +151,8 @@ using DeviceOpInstance = ck::tensor_operation::device::Devic MXDLPerWave, NXDLPerWave, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, BK1, BK1, 0, - 1, 1, S<1, CShuffleMLane, 1, 
CShuffleNLane>, S, - ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, false, false, MulRoutedWeight, A0DataType>; + 2, 2, S<1, CShuffleMLane, 1, CShuffleNLane>, S, + ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, 0, false, false, MulRoutedWeight, false, ck::index_t, A0DataType>; // clang-format on int main(int argc, char* argv[]) @@ -159,9 +161,6 @@ int main(int argc, char* argv[]) int init_method = 1; bool time_kernel = true; - // tokens = 1 - // topk = 1 - // experts = 8 // per expert: // GEMM shape ck::index_t N = 4096; diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp new file mode 100644 index 0000000000..29750b8baa --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp @@ -0,0 +1,621 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp" + +namespace ck { + +// Compute optimized pipeline +// GlobalPrefetchStages: 2 +// LocalPreFillStages: 1 +// LocalPreFetchStages: 1 +// LocalSharedMemoryBuffer: 1 + +template +struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1 +{ +}; + +template +struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< + BlockGemmPipelineScheduler::Intrawave, + BlockSize, + ADataType, + BDataType, + ComputeDataType, + AccDataType, + ATileDesc, + BTileDesc, + AMmaTileDesc, + BMmaTileDesc, + ABlockTransferSrcScalarPerVector, + BBlockTransferSrcScalarPerVector, + MPerBlock, + NPerBlock, + KPerBlock, + MPerXDL, + NPerXDL, + MRepeat, + NRepeat, + KPack> : BlockwiseGemmXdlops_pipeline_base + +{ + using Base = BlockwiseGemmXdlops_pipeline_base; + using Base::A_K1; + using Base::B_K1; + using Base::I0; + using Base::I1; + using Base::KRepeat; + using Base::xdlops_gemm; + using typename Base::HotLoopInstList; + + using Base::a_block_desc_m0_m1_m2_k; + using Base::CalculateCThreadOriginDataIndex; + using Base::CalculateCThreadOriginDataIndex8D; + using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4; + using Base::GetCThreadBuffer; + using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4; + using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + + using Base::AMmaKStride; + using Base::BMmaKStride; + using Base::c_thread_desc_; + + static constexpr index_t PrefetchStages = 2; + static constexpr index_t PrefillStages = 1; + static constexpr index_t GlobalBufferNum = 2; + + template + __host__ __device__ static constexpr auto MakeAGemmMmaTileDescriptor(const 
TileDesc_M0_M1_M2_K&) + { + constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{}); + constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{}); + constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{}); + constexpr index_t K2 = KPack; + constexpr index_t K1 = 64 / NPerXDL; + constexpr index_t K0 = KRepeat; + + return transform_tensor_descriptor( + TileDesc_M0_M1_M2_K{}, + make_tuple( + make_pass_through_transform(Number{}), + make_pass_through_transform(Number{}), + make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4, 5>{})); + } + + static constexpr auto a_block_desc_m0_m1_m2_k0_k1_k2 = + MakeAGemmMmaTileDescriptor(a_block_desc_m0_m1_m2_k); + + __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop) + { + return num_loop > PrefetchStages; + } + + __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop) + { + return num_loop % 2 == 0 ? 
TailNumber::Even : TailNumber::Odd; + } + + __device__ static constexpr auto HotLoopScheduler() + { + constexpr auto num_ds_read_inst_a = HotLoopInstList::A_LDS_Read_Inst_Num; + constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num; + constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num; + + // B global + static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) { + ignore = i; + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + }); + + // A global + static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) { + ignore = i; + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + }); + + // A local + static_for<0, num_ds_read_inst_a / 2, 1>{}([&](auto i) { + ignore = i; + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read + }); + } + + template + __device__ void Run(const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + BBlockTransfer& b_blockwise_copy, + BBlockTransfer& b_blockwise_copy_up, + const BGridBuffer& b_grid_buf, + const BGridBuffer& b_grid_buf_up, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + CThreadBuffer& c_thread_buf, + CThreadBuffer& c_thread_buf_up, + index_t num_loop) const + + { + ignore = b_block_buf; + __builtin_amdgcn_sched_barrier(0); + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + + auto b_thread_dequant_buf = 
make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + + StaticallyIndexedArray{}> b_thread_bufs; + StaticallyIndexedArray{}> b_thread_bufs_up; + constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0); + + StaticallyIndexedArray{}> b_thread_dequant_bufs; + StaticallyIndexedArray{}> + b_thread_dequant_bufs_up; + + // Global prefetch A1 B1 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0); + b_blockwise_copy.Run(b_grid_desc, + b_grid_buf, + b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs(I0)); + b_blockwise_copy_up.Run(b_grid_desc, + b_grid_buf_up, + b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs_up(I0)); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + __builtin_amdgcn_sched_barrier(0); + + // // Local prefill A1 + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0); + + // // Global prefetch A2 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0); + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + + // Local prefetch A1 + block_sync_lds(); + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, KRepeat, 1>{}([&](auto k0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(m0, I0, I0, k0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, I0, k0, I0, I0), + a_thread_buf); + }); + }); + // B VGPR->VGPR dequant + b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs(I0), + b_thread_desc_, + make_tuple(I0, I0, I0, I0), + b_thread_dequant_bufs(I0)); + b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs_up(I0), + b_thread_desc_, + make_tuple(I0, I0, I0, I0), + b_thread_dequant_bufs_up(I0)); + + // Initialize C + c_thread_buf.Clear(); + c_thread_buf_up.Clear(); + + __builtin_amdgcn_sched_barrier(0); + + // main 
body + if constexpr(HasMainLoop) + { + index_t i = 0; + do + { + auto LoopFunc = [&](auto mfma_reg_buf, auto local_read_buf) { + b_blockwise_copy.Run(b_grid_desc, + b_grid_buf, + b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs(local_read_buf)); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + b_blockwise_copy_up.Run(b_grid_desc, + b_grid_buf_up, + b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs_up(local_read_buf)); + b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + block_sync_lds(); + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, mfma_reg_buf); + + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, local_read_buf); + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + static_for<0, KRepeat, 1>{}([&](auto k0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + vector_type b_thread_vec_up; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_dequant_bufs[mfma_reg_buf] + [Number{}]; + b_thread_vec_up.template AsType()(ik) = + b_thread_dequant_bufs_up + [mfma_reg_buf][Number{}]; + }); + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.Run( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + + xdlops_gemm.Run( + a_thread_vec.template AsType(), + b_thread_vec_up.template AsType(), + c_thread_buf_up.GetVectorTypeReference(Number{})); + }); + }); + }); + + block_sync_lds(); + + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, KRepeat, 1>{}([&](auto k0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(m0, I0, I0, k0, I0, I0), + a_block_buf, + a_thread_desc_, + 
make_tuple(m0, I0, I0, k0, I0, I0), + a_thread_buf); + }); + }); + // B VGPR->VGPR dequant + b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs(local_read_buf), + b_thread_desc_, + make_tuple(I0, I0, I0, I0), + b_thread_dequant_bufs(local_read_buf)); + b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs_up(local_read_buf), + b_thread_desc_, + make_tuple(I0, I0, I0, I0), + b_thread_dequant_bufs_up(local_read_buf)); + + HotLoopScheduler(); + __builtin_amdgcn_sched_barrier(0); + }; + + LoopFunc(I0, I1); + LoopFunc(I1, I0); + + i += 2; + } while(i < (num_loop - 2)); + } + // tail + if constexpr(TailNum == TailNumber::Even) + { + b_blockwise_copy.Run(b_grid_desc, + b_grid_buf, + b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs(I1)); + + b_blockwise_copy_up.Run(b_grid_desc, + b_grid_buf_up, + b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs_up(I1)); + + block_sync_lds(); + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + static_for<0, KRepeat, 1>{}([&](auto k0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + vector_type b_thread_vec_up; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_dequant_bufs[I0][Number{}]; + b_thread_vec_up.template AsType()(ik) = + b_thread_dequant_bufs_up[I0][Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec_up.template AsType(), + c_thread_buf_up.GetVectorTypeReference(Number{})); + }); + }); + }); + + 
block_sync_lds(); + + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, KRepeat, 1>{}([&](auto k0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(m0, I0, I0, k0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, I0, k0, I0, I0), + a_thread_buf); + }); + }); + // B VGPR->VGPR dequant + b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs(I1), + b_thread_desc_, + make_tuple(I0, I0, I0, I0), + b_thread_dequant_bufs(I1)); + + b_thread_dequant_copy_.Run(b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs_up(I1), + b_thread_desc_, + make_tuple(I0, I0, I0, I0), + b_thread_dequant_bufs_up(I1)); + __builtin_amdgcn_sched_barrier(0); + + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + static_for<0, KRepeat, 1>{}([&](auto k0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + vector_type b_thread_vec_up; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_dequant_bufs[I1][Number{}]; + b_thread_vec_up.template AsType()(ik) = + b_thread_dequant_bufs_up[I1][Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec_up.template AsType(), + c_thread_buf_up.GetVectorTypeReference(Number{})); + }); + }); + }); + // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle + // latency + // __builtin_amdgcn_sched_barrier(0); + } + else + { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + static_for<0, KRepeat, 1>{}([&](auto k0) { + vector_type a_thread_vec; + vector_type 
b_thread_vec; + vector_type b_thread_vec_up; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_dequant_bufs[I0][Number{}]; + b_thread_vec_up.template AsType()(ik) = + b_thread_dequant_bufs_up[I0][Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec_up.template AsType(), + c_thread_buf_up.GetVectorTypeReference(Number{})); + }); + }); + }); + } + } + + protected: + // MRepeat MWave MLane KRepeat KLane KPack + // KRepeat -> MRepeat-> Mwave->KLane->MLane->KPack + static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, I1, I1, Number{}, I1, Number{})); + + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3, 4, 5>, + 5, + A_K1, + A_K1>; + + AThreadCopy a_thread_copy_{Base::CalculateAThreadOriginDataIndex6D()}; + + static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, I1, Number{}, Number{})); + + static constexpr BTileDesc b_block_desc_n0_n1_k0_k1; + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + using BThreadDequantCopy = ThreadwiseTensorSliceTransfer_StaticToStatic< + BDataType, + ComputeDataType, + decltype(b_block_desc_n0_n1_k0_k1), + decltype(b_block_desc_n0_n1_k0_k1), + tensor_operation::element_wise::PassThrough, + Sequence{}, I1, Number{}, Number{}>, + Sequence<1, 2, 0, 3>, + 3, + KPack>; + + const PassThrough b_element_op{}; + BThreadDequantCopy b_thread_dequant_copy_{b_element_op}; +}; + +} // namespace ck diff --git 
a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v1.hpp new file mode 100644 index 0000000000..73749c6309 --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v1.hpp @@ -0,0 +1,573 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp" + +namespace ck { + +// Compute optimized pipeline +// GlobalPrefetchStages: 2 +// LocalPreFillStages: 1 +// LocalPreFetchStages: 1 +// LocalSharedMemoryBuffer: 1 + +template +struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1 +{ +}; + +template +struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1 + : BlockwiseGemmXdlops_pipeline_base + +{ + using Base = BlockwiseGemmXdlops_pipeline_base; + using Base::A_K1; + using Base::B_K1; + using Base::I0; + using Base::I1; + using Base::KRepeat; + using Base::xdlops_gemm; + using typename Base::HotLoopInstList; + + using Base::a_block_desc_m0_m1_m2_k; + using Base::CalculateCThreadOriginDataIndex; + using Base::CalculateCThreadOriginDataIndex8D; + using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4; + using Base::GetCThreadBuffer; + using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4; + using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2; + using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2; + + using Base::AMmaKStride; + using Base::BMmaKStride; + using Base::c_thread_desc_; + using Base::MWaves; + + static constexpr index_t PrefetchStages = 2; + static constexpr 
index_t PrefillStages = 1; + static constexpr index_t GlobalBufferNum = 2; + + template + __host__ __device__ static constexpr auto MakeAGemmMmaTileDescriptor(const TileDesc_M0_M1_M2_K&) + { + constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{}); + constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{}); + constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{}); + constexpr index_t K2 = KPack; + constexpr index_t K1 = 64 / NPerXDL; + constexpr index_t K0 = KRepeat; + + return transform_tensor_descriptor( + TileDesc_M0_M1_M2_K{}, + make_tuple( + make_pass_through_transform(Number{}), + make_pass_through_transform(Number{}), + make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4, 5>{})); + } + + static constexpr auto a_block_desc_m0_m1_m2_k0_k1_k2 = + MakeAGemmMmaTileDescriptor(a_block_desc_m0_m1_m2_k); + + __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop) + { + return num_loop > PrefetchStages; + } + + __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop) + { + return num_loop % 2 == 0 ? TailNumber::Even : TailNumber::Odd; + } + + __device__ static constexpr auto HotLoopScheduler() + { + constexpr auto num_ds_read_inst_a = HotLoopInstList::A_LDS_Read_Inst_Num; + constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num; + constexpr auto num_buffer_load_inst_b = + HotLoopInstList::B_Buffer_Load_Inst_Num * MWaves * 2; + constexpr auto mfma_interleave = MPerXDL == 32 ? 
1 : 2; + // B global + static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) { + ignore = i; + if constexpr(MPerBlock >= 128 && NPerBlock >= 64) + { + __builtin_amdgcn_sched_group_barrier(0x008, 2 * mfma_interleave, 0); + } + else + { + __builtin_amdgcn_sched_group_barrier(0x008, mfma_interleave, 0); + } + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + // if constexpr(i.value < num_buffer_load_inst_a) { + // __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + // __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + // __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + // __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + // } + }); + + // A global + static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) { + ignore = i; + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + }); + + // A local + static_for<0, MPerXDL == 32 ? num_ds_read_inst_a / 2 : num_ds_read_inst_a, 1>{}( + [&](auto i) { + ignore = i; + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + __builtin_amdgcn_sched_group_barrier(0x100, MPerXDL == 32 ? 
2 : 1, 0); // DS read + }); + } + + template + __device__ void Run(const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + BBlockTransfer& b_blockwise_copy, + BBlockTransfer& b_blockwise_copy_up, + const BGridBuffer& b_grid_buf, + const BGridBuffer& b_grid_buf_up, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + CThreadBuffer& c_thread_buf, + CThreadBuffer& c_thread_buf_up, + index_t num_loop) const + { + ignore = b_block_buf; + __builtin_amdgcn_sched_barrier(0); + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + + StaticallyIndexedArray{}> b_thread_bufs; + StaticallyIndexedArray{}> b_thread_bufs_up; + constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0); + + // Global prefetch A1 B1 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0); + b_blockwise_copy.Run(b_grid_desc, + b_grid_buf, + b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs(I0)); + b_blockwise_copy_up.Run(b_grid_desc, + b_grid_buf_up, + b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs_up(I0)); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + __builtin_amdgcn_sched_barrier(0); + + // // Local prefill A1 + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0); + + // // Global prefetch A2 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0); + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + + // Local prefetch A1 + block_sync_lds(); + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, KRepeat, 1>{}([&](auto k0) { + 
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(m0, I0, I0, k0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, I0, k0, I0, I0), + a_thread_buf); + }); + }); + + // Initialize C + c_thread_buf.Clear(); + c_thread_buf_up.Clear(); + + __builtin_amdgcn_sched_barrier(0); + + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + do + { + auto LoopFunc = [&](auto mfma_reg_buf, auto local_read_buf) { + b_blockwise_copy.Run(b_grid_desc, + b_grid_buf, + b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs(local_read_buf)); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + b_blockwise_copy_up.Run(b_grid_desc, + b_grid_buf_up, + b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs_up(local_read_buf)); + b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + block_sync_lds(); + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, mfma_reg_buf); + + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, local_read_buf); + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + static_for<0, KRepeat, 1>{}([&](auto k0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + vector_type b_thread_vec_up; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_bufs[mfma_reg_buf] + [Number{}]; + b_thread_vec_up.template AsType()(ik) = + b_thread_bufs_up[mfma_reg_buf] + [Number{}]; + }); + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.Run( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + + xdlops_gemm.Run( + a_thread_vec.template AsType(), + b_thread_vec_up.template AsType(), + 
c_thread_buf_up.GetVectorTypeReference(Number{})); + }); + }); + }); + + block_sync_lds(); + + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, KRepeat, 1>{}([&](auto k0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(m0, I0, I0, k0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, I0, k0, I0, I0), + a_thread_buf); + }); + }); + + HotLoopScheduler(); + __builtin_amdgcn_sched_barrier(0); + }; + + LoopFunc(I0, I1); + LoopFunc(I1, I0); + + i += 2; + } while(i < (num_loop - 2)); + } + // tail + if constexpr(TailNum == TailNumber::Even) + { + b_blockwise_copy.Run(b_grid_desc, + b_grid_buf, + b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs(I1)); + + b_blockwise_copy_up.Run(b_grid_desc, + b_grid_buf_up, + b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs_up(I1)); + block_sync_lds(); + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + static_for<0, KRepeat, 1>{}([&](auto k0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + vector_type b_thread_vec_up; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_bufs[I0][Number{}]; + b_thread_vec_up.template AsType()(ik) = + b_thread_bufs_up[I0][Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + + xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec_up.template AsType(), + c_thread_buf_up.GetVectorTypeReference(Number{})); + }); + }); + }); + + block_sync_lds(); + + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, KRepeat, 1>{}([&](auto k0) { + 
a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(m0, I0, I0, k0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, I0, k0, I0, I0), + a_thread_buf); + }); + }); + + __builtin_amdgcn_sched_barrier(0); + + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + static_for<0, KRepeat, 1>{}([&](auto k0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + vector_type b_thread_vec_up; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_bufs[I1][Number{}]; + b_thread_vec_up.template AsType()(ik) = + b_thread_bufs_up[I1][Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec_up.template AsType(), + c_thread_buf_up.GetVectorTypeReference(Number{})); + }); + }); + }); + // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle + // latency + // __builtin_amdgcn_sched_barrier(0); + } + else if constexpr(TailNum == TailNumber::Odd) + { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + static_for<0, KRepeat, 1>{}([&](auto k0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + vector_type b_thread_vec_up; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_bufs[I0][Number{}]; + b_thread_vec_up.template AsType()(ik) = + b_thread_bufs_up[I0][Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + 
xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + xdlops_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec_up.template AsType(), + c_thread_buf_up.GetVectorTypeReference(Number{})); + }); + }); + }); + } + } + + protected: + // MRepeat MWave MLane KRepeat KLane KPack + // KRepeat -> MRepeat-> Mwave->KLane->MLane->KPack + static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, I1, I1, Number{}, I1, Number{})); + + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3, 4, 5>, + 5, + A_K1, + A_K1>; + + AThreadCopy a_thread_copy_{Base::CalculateAThreadOriginDataIndex6D()}; + + static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, I1, Number{}, Number{})); + + static constexpr BTileDesc b_block_desc_n0_n1_k0_k1; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp index a94ef03008..074b5873ee 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp @@ -3,8 +3,10 @@ #pragma once +#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v1.hpp" #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp" #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp" #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp" #include 
"ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp" #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp" @@ -33,57 +35,112 @@ template + index_t KPack, + bool GUFusion = false> constexpr auto BlockGemmBPreshufflePipeline_Selector() { if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1) { if constexpr(std::is_same::value) { - return BlockwiseGemmXdlops_pipeline_bpreshuffle_v1{}; + if constexpr(GUFusion) + { + return BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1< + BlkGemmPipeSche, + BlockSize, + ADataType, + BDataType, + ComputeDataType, + AccDataType, + ATileDesc, + BTileDesc, + AMmaTileDesc, + BMmaTileDesc, + ABlockTransferSrcScalarPerVector, + BBlockTransferSrcScalarPerVector, + MPerBlock, + NPerBlock, + KPerBlock, + MPerXDL, + NPerXDL, + MRepeat, + NRepeat, + KPack>{}; + } + else + { + return BlockwiseGemmXdlops_pipeline_bpreshuffle_v1{}; + } } else { - return BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v1< - BlkGemmPipeSche, - BlockSize, - ADataType, - BDataType, - ComputeDataType, - AccDataType, - ATileDesc, - BTileDesc, - AMmaTileDesc, - BMmaTileDesc, - ABlockTransferSrcScalarPerVector, - BBlockTransferSrcScalarPerVector, - MPerBlock, - NPerBlock, - KPerBlock, - MPerXDL, - NPerXDL, - MRepeat, - NRepeat, - KPack>{}; + if constexpr(GUFusion) + { + return BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< + BlkGemmPipeSche, + BlockSize, + ADataType, + BDataType, + ComputeDataType, + AccDataType, + ATileDesc, + BTileDesc, + AMmaTileDesc, + BMmaTileDesc, + ABlockTransferSrcScalarPerVector, + BBlockTransferSrcScalarPerVector, + MPerBlock, + NPerBlock, + KPerBlock, + MPerXDL, + NPerXDL, + MRepeat, + NRepeat, + KPack>{}; + } + else + { + return BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v1< + BlkGemmPipeSche, + BlockSize, + ADataType, + BDataType, + ComputeDataType, + AccDataType, + ATileDesc, + BTileDesc, + AMmaTileDesc, + BMmaTileDesc, + 
ABlockTransferSrcScalarPerVector, + BBlockTransferSrcScalarPerVector, + MPerBlock, + NPerBlock, + KPerBlock, + MPerXDL, + NPerXDL, + MRepeat, + NRepeat, + KPack>{}; + } } } else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp index d7ba2559ea..ce507ca8d3 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp @@ -46,7 +46,8 @@ struct BlockwiseGemmXdlops_pipeline_base static constexpr index_t A_K0 = ATileDesc{}.GetLength(I0); static constexpr index_t B_K0 = BTileDesc{}.GetLength(I0); static constexpr index_t A_K1 = ATileDesc{}.GetLength(I2); - static constexpr index_t B_K1 = BTileDesc{}.GetLength(I2); + static constexpr index_t B_K1 = + BTileDesc{}.GetLength(Number < BTileDesc{}.GetNumOfDimension() == 4 ? 
3 : 2 > {}); static constexpr auto xdlops_gemm = XdlopsGemm{}; @@ -333,7 +334,7 @@ struct BlockwiseGemmXdlops_pipeline_base return xdlops_gemm.MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2( c_grid_desc_g_m0_n0_m1_n1_m2_n2); } - + __host__ __device__ static constexpr auto GetCThreadDesc() { return c_thread_desc_; } static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k; static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_k; diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_gather.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_gather.hpp index 859649185a..92aef65388 100644 --- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_gather.hpp +++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_gather.hpp @@ -41,6 +41,7 @@ template struct ThreadGroupTensorSliceTransfer_v4r1_gather @@ -58,7 +59,7 @@ struct ThreadGroupTensorSliceTransfer_v4r1_gather const DstDesc& dst_desc, const Index& dst_block_slice_origin, const DstElementwiseOperation& dst_element_op, - const StaticallyIndexedArray& gather_offsets) + const StaticallyIndexedArray& gather_offsets) : threadwise_transfer_(src_desc, make_zero_multi_index(), src_element_op, @@ -190,6 +191,7 @@ struct ThreadGroupTensorSliceTransfer_v4r1_gather DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, + IndexType, GatherDim, NumThreadScratch>; diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp index cf758e4d5f..bee0b01a74 100644 --- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp +++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced 
Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -42,6 +42,7 @@ template __device__ void RunRead(const SrcDescs& src_descs, const SrcBuffers& src_bufs, - StaticallyIndexedArray& scatter_weights, Number thread_scratch_id = Number{}) { if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) { - threadwise_transfer_.RunRead(src_descs, src_bufs, scatter_weights, thread_scratch_id); + threadwise_transfer_.RunRead(src_descs, src_bufs, thread_scratch_id); } } @@ -149,7 +149,7 @@ struct ThreadGroupTensorSliceTransfer_v7r3_scatter template __device__ void RunWrite(const DstDescs& dst_descs, DstBuffers dst_bufs, - StaticallyIndexedArray& scatter_offsets, + StaticallyIndexedArray& scatter_offsets, Number thread_scratch_id = Number{}) { if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or @@ -169,10 +169,9 @@ struct ThreadGroupTensorSliceTransfer_v7r3_scatter const SrcBuffers& src_bufs, const DstDescs& dst_descs, DstBuffers dst_bufs, - StaticallyIndexedArray& scatter_offsets, - StaticallyIndexedArray& scatter_weights) + StaticallyIndexedArray& scatter_offsets) { - RunRead(src_descs, src_bufs, scatter_weights); + RunRead(src_descs, src_bufs); RunWrite(dst_descs, dst_bufs, scatter_offsets); } @@ -230,6 +229,7 @@ struct ThreadGroupTensorSliceTransfer_v7r3_scatter DstScalarPerVector, ThreadTransferSrcResetCoordinateAfterRunFlags, ThreadTransferDstResetCoordinateAfterRunFlags, + IndexType, ScatterDim, OutputScatter, ScatterWeightIdx, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp index 03db4bdd41..08d177035e 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp @@ -65,9 +65,12 @@ template ; 
RunKernel(kernel); } @@ -281,8 +287,6 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle; RunKernel(kernel); } @@ -297,8 +301,6 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle; RunKernel(kernel); } @@ -308,8 +310,6 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle; RunKernel(kernel); } @@ -329,8 +329,6 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle; RunKernel(kernel); } diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp index a2d1114bbe..255fb8cff4 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp @@ -12,7 +12,7 @@ #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_gather.hpp" #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp" @@ -26,12 +26,17 @@ namespace ck { // two lds chunks. // 2. 
Occupied __shared__ won't release until whole shader end, a.k.a AB and C may not use same lds // buffer when we declare __shared__ inside blkgemmpipe + +enum Activation +{ + gelu_and_mul = 0, + silu_and_mul = 1 +}; + template __global__ void #if CK_USE_LAUNCH_BOUNDS @@ -45,22 +50,19 @@ __global__ void auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z); - GridwiseGemm::template Run(karg.p_sorted_token_ids, - karg.p_sorted_expert_ids, - karg.p_max_token_id, - karg.p_a_grid + splitk_batch_offset.a_k_split_offset, - karg.p_b_grid + splitk_batch_offset.b_k_split_offset, - karg.p_ds_grid, - karg.p_c_grid, - p_shared, - karg, - karg.a_element_op, - karg.b_element_op, - karg.c_element_op); + GridwiseGemm::template Run( + karg.p_sorted_token_ids, + karg.p_sorted_expert_ids, + karg.p_max_token_id, + karg.p_a_grid + splitk_batch_offset.a_k_split_offset, + karg.p_b_grid + splitk_batch_offset.b_k_split_offset, + karg.p_ds_grid, + karg.p_c_grid, + p_shared, + karg, + karg.a_element_op, + karg.b_element_op, + karg.c_element_op); #else ignore = karg; #endif // end of if (defined(__gfx9__)) @@ -70,8 +72,6 @@ template __global__ void #if CK_USE_LAUNCH_BOUNDS @@ -86,23 +86,20 @@ __global__ void auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z); - GridwiseGemm::template Run_2Lds(karg.p_sorted_token_ids, - karg.p_sorted_expert_ids, - karg.p_max_token_id, - karg.p_a_grid + splitk_batch_offset.a_k_split_offset, - karg.p_b_grid + splitk_batch_offset.b_k_split_offset, - karg.p_ds_grid, - karg.p_c_grid, - p_shared, - p_shared1, - karg, - karg.a_element_op, - karg.b_element_op, - karg.c_element_op); + GridwiseGemm::template Run_2Lds( + karg.p_sorted_token_ids, + karg.p_sorted_expert_ids, + karg.p_max_token_id, + karg.p_a_grid + splitk_batch_offset.a_k_split_offset, + karg.p_b_grid + splitk_batch_offset.b_k_split_offset, + karg.p_ds_grid, + karg.p_c_grid, + p_shared, + p_shared1, + karg, + karg.a_element_op, + 
karg.b_element_op, + karg.c_element_op); #else ignore = karg; #endif // end of if (defined(__gfx9__)) @@ -154,7 +151,12 @@ template ) @@ -497,8 +500,8 @@ struct GridwiseMoeGemm } template - __host__ __device__ static auto - MakeCGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC) + __host__ __device__ static auto MakeCGridDescriptor_M_N( + IndexType M, IndexType MPad, IndexType N, IndexType NPad, IndexType StrideC) { const auto c_grid_desc_mraw_nraw = [&]() { if constexpr(is_same::value) @@ -909,7 +912,8 @@ struct GridwiseMoeGemm NPerXdl, MXdlPerWave, NXdlPerWave, - KPack>())>; + KPack, + IsInputGemm>())>; __device__ static constexpr index_t GetSharedMemoryNumberOfByte() { @@ -1141,9 +1145,7 @@ struct GridwiseMoeGemm template + TailNumber TailNum = TailNumber::Odd> __device__ static void Run(const index_t* p_sorted_token_ids, const index_t* p_sorted_expert_ids, const index_t* p_max_token_id, @@ -1203,6 +1205,7 @@ struct GridwiseMoeGemm return {blockIdx.x, blockIdx.y}; } }(); + const index_t block_n_id = block_mn.first; const index_t block_m_id = block_mn.second; const index_t token0 = @@ -1218,7 +1221,7 @@ struct GridwiseMoeGemm if(token_pos >= max_token_id || token0 >= problem.NumTokens) return; - StaticallyIndexedArray gather_offsets; + StaticallyIndexedArray gather_offsets; static_for<0, AMRepeats, 1>{}([&](auto m0) { const index_t fused_token = p_sorted_token_ids[token_pos + m0]; index_t token_offset = fused_token & 0xffffff; @@ -1226,9 +1229,10 @@ struct GridwiseMoeGemm { token_offset = token_offset * problem.TopK + (fused_token >> 24); } - gather_offsets(m0) = token_offset * problem.K; + gather_offsets(m0) = static_cast(token_offset) * problem.K; }); - const index_t expert_stride = __builtin_amdgcn_readfirstlane(problem.N * problem.K); + const index_t expert_stride = + __builtin_amdgcn_readfirstlane(problem.N * problem.K * (IsInputGemm ? 
2 : 1)); // N0, K0, Blocksize*KPack const index_t n_block_data_idx_on_grid = @@ -1239,7 +1243,6 @@ struct GridwiseMoeGemm const auto b_grid_buf = make_dynamic_buffer( p_b_grid + expert_id * expert_stride / BPackedSize, b_grid_desc_bpreshuffled.GetElementSpaceSize()); - // A matrix in LDS memory, dst of blockwise copy constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); @@ -1269,6 +1272,7 @@ struct GridwiseMoeGemm 1, AThreadTransferSrcResetCoordinateAfterRun, true, + IndexType, 1, BlockwiseGemmPipe::GlobalBufferNum>(a_grid_desc_ak0_m_ak1, make_multi_index(0, 0, 0), @@ -1311,24 +1315,74 @@ struct GridwiseMoeGemm static_assert(std::is_default_constructible_v); auto blockwise_gemm_pipeline = BlockwiseGemmPipe{}; auto c_thread_buf = blockwise_gemm_pipeline.GetCThreadBuffer(); + decltype(c_thread_buf) c_thread_buf_up; + + StaticBufferTupleOfVector + c_thread_buf_fp32; const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / KPerBlock); - - blockwise_gemm_pipeline.template Run(a_grid_desc_ak0_m_ak1, - a_block_desc_ak0_m_ak1, - a_blockwise_copy, - a_grid_buf, - a_block_buf, - a_block_slice_copy_step, - b_grid_desc_bpreshuffled, - b_blockwise_copy, - b_grid_buf, - b_block_buf, - b_block_slice_copy_step, - c_thread_buf, - num_k_block_main_loop); + if constexpr(IsInputGemm) + { + const BDataType* p_b_grid_up = p_b_grid + expert_stride / 2 / BPackedSize; + const auto b_grid_buf_up = make_dynamic_buffer( + p_b_grid_up + expert_id * expert_stride / BPackedSize, + b_grid_desc_bpreshuffled.GetElementSpaceSize()); + auto b_blockwise_copy_up = ThreadwiseTensorSliceTransfer_v2< + BDataType, + BDataType, + decltype(b_grid_desc_bpreshuffled), + decltype(b_block_desc_bk0_n_bk1), + Sequence{}, I1, Number{}, Number{}>, + Sequence<1, 2, 0, 3>, + 3, + BBlockTransferSrcScalarPerVector, + BThreadTransferSrcResetCoordinateAfterRun, + true>(b_grid_desc_bpreshuffled, + 
make_multi_index(n_block_data_idx_on_grid, + get_warp_local_1d_id() % NWave, + 0, + KPack * (get_thread_local_1d_id() % warpSize))); + blockwise_gemm_pipeline.template Run( + a_grid_desc_ak0_m_ak1, + a_block_desc_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bpreshuffled, + b_blockwise_copy, + b_blockwise_copy_up, + b_grid_buf, + b_grid_buf_up, + b_block_buf, + b_block_slice_copy_step, + c_thread_buf, + c_thread_buf_up, + num_k_block_main_loop); + } + else + { + blockwise_gemm_pipeline.template Run( + a_grid_desc_ak0_m_ak1, + a_block_desc_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bpreshuffled, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + c_thread_buf, + num_k_block_main_loop); + } // shuffle C and write out { @@ -1356,6 +1410,185 @@ struct GridwiseMoeGemm constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); + // mul scales + const float* p_sorted_weights_0 = p_ds_grid[I0]; + const float* p_scale_b = p_ds_grid[I1]; + + static_assert(M0 * M1 * M2 * M3 * M4 == MPerBlock); + static_assert(M4 == 4); + const index_t m1 = get_warp_local_1d_id() / NWave; + const index_t m3 = threadIdx.x % get_warp_size() / MPerXdl; + + if(p_sorted_weights_0 != nullptr && p_scale_b != nullptr) + { + if constexpr(PerTokenQuant) + { + constexpr index_t scale_stride = (IsInputGemm ? 
2 : 1); + p_scale_b += expert_id * problem.N * scale_stride + block_n_id * NPerBlock + + get_warp_local_1d_id() % NWave * NPerXdl + threadIdx.x % NPerXdl; + } + else + { + p_scale_b += expert_id; + } + + vector_type scale_token_ids; + vector_type topk_weights; + static_for<0, NXdlPerWave, 1>{}([&](auto n0) { + const float scale_b = p_scale_b[n0 * NWave * NPerXdl * PerTokenQuant]; + static_for<0, MXdlPerWave, 1>{}([&](auto m0) { // MXDLPerWave + static_for<0, M2, 1>{}([&](auto m2) { // m_inst_num_groups_per_blk + const index_t m_pos = block_m_id * MPerBlock + m0 * M1 * M2 * M3 * M4 + + m1 * M2 * M3 * M4 + m2 * M3 * M4 + m3 * M4; + if constexpr(PerTokenQuant) + { + scale_token_ids = + *c_style_pointer_cast*>( + p_sorted_token_ids + m_pos); + } + if constexpr(MulRoutedWeight) + { + topk_weights = *c_style_pointer_cast*>( + p_ds_grid[I2] + m_pos); + } + static_for<0, M4, 1>{}([&](auto m4) { // m_inst_group_size + float scale_a = [&]() { + if constexpr(PerTokenQuant) + { + index_t fused_token = scale_token_ids.AsType()[m4]; + const index_t token_offset = fused_token & 0xffffff; + return token_offset < problem.NumTokens + ? 
p_sorted_weights_0[token_offset] + : 0.0; + } + else + { + return p_sorted_weights_0[0]; + } + }(); + constexpr index_t c_offset = + blockwise_gemm_pipeline.GetCThreadDesc().CalculateOffset( + make_tuple(m0, n0, m2 * M4 + m4)); + constexpr auto cidx = Number{}; + if constexpr(IsInputGemm) // gu fusion + { + if constexpr(ActivationOperation == Activation::silu_and_mul) + { + const float scale_up = + p_scale_b[(n0 * NWave * NPerXdl + problem.N) * + PerTokenQuant]; + float gate = scale_a * scale_b * c_thread_buf[cidx]; + float up = scale_a * scale_up * c_thread_buf_up[cidx]; + if constexpr(MulRoutedWeight) + { + gate = gate * topk_weights.AsType()[m4]; + up = up * topk_weights.AsType()[m4]; + } + if constexpr(is_same_v, pk_i4_t>) + { + gate *= 16; + up *= 16; + } + tensor_operation::element_wise::Silu{}(gate, gate); + c_thread_buf_fp32(cidx) = gate * up; + } + else if(ActivationOperation == Activation::gelu_and_mul) + { + const float scale_up = + p_scale_b[(n0 * NWave * NPerXdl + problem.N) * + PerTokenQuant]; + float gate = scale_a * scale_b * c_thread_buf[cidx]; + float up = scale_a * scale_up * c_thread_buf_up[cidx]; + if constexpr(MulRoutedWeight) + { + gate = gate * topk_weights.AsType()[m4]; + up = up * topk_weights.AsType()[m4]; + } + if constexpr(is_same_v, pk_i4_t>) + { + gate *= 16; + up *= 16; + } + tensor_operation::element_wise::Gelu{}(gate, gate); + c_thread_buf_fp32(cidx) = gate * up; + } + } + else + { + c_thread_buf_fp32(cidx) = + scale_a * scale_b * c_thread_buf[cidx]; + if constexpr(MulRoutedWeight) + { + c_thread_buf_fp32(cidx) = c_thread_buf_fp32(cidx) * + topk_weights.AsType()[m4]; + } + } + }); + }); + }); + }); + } + else + { + vector_type topk_weights; // for gemm2 only + static_for<0, NXdlPerWave, 1>{}([&](auto n0) { + static_for<0, MXdlPerWave, 1>{}([&](auto m0) { // MXDLPerWave + static_for<0, M2, 1>{}([&](auto m2) { // m_inst_num_groups_per_blk + const index_t m_pos = block_m_id * MPerBlock + m0 * M1 * M2 * M3 * M4 + + m1 * M2 * M3 * M4 + 
m2 * M3 * M4 + m3 * M4; + if constexpr(MulRoutedWeight) + { + topk_weights = *c_style_pointer_cast*>( + p_ds_grid[I2] + m_pos); + } + static_for<0, M4, 1>{}([&](auto m4) { // m_inst_group_size + constexpr index_t c_offset = + blockwise_gemm_pipeline.GetCThreadDesc().CalculateOffset( + make_tuple(m0, n0, m2 * M4 + m4)); + constexpr auto cidx = Number{}; + + if constexpr(IsInputGemm) // gu fusion + { + if constexpr(ActivationOperation == Activation::silu_and_mul) + { + float gate = c_thread_buf[cidx]; + float up = c_thread_buf_up[cidx]; + if constexpr(MulRoutedWeight) + { + gate = gate * topk_weights.AsType()[m4]; + up = up * topk_weights.AsType()[m4]; + } + tensor_operation::element_wise::Silu{}(gate, gate); + c_thread_buf_fp32(cidx) = gate * up; + } + else if(ActivationOperation == Activation::gelu_and_mul) + { + float gate = c_thread_buf[cidx]; + float up = c_thread_buf_up[cidx]; + if constexpr(MulRoutedWeight) + { + gate = gate * topk_weights.AsType()[m4]; + up = up * topk_weights.AsType()[m4]; + } + tensor_operation::element_wise::Gelu{}(gate, gate); + c_thread_buf_fp32(cidx) = gate * up; + } + } + else + { + c_thread_buf_fp32(cidx) = c_thread_buf[cidx]; + if constexpr(MulRoutedWeight) + { + c_thread_buf_fp32(cidx) = topk_weights.AsType()[m4] * + c_thread_buf_fp32[cidx]; + } + } + }); + }); + }); + }); + } + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); @@ -1453,17 +1686,8 @@ struct GridwiseMoeGemm const auto ds_grid_buf = generate_tuple( [&](auto i) { - using DDataType = remove_cvref_t>; - const DDataType* ptr_ = p_ds_grid[i]; - // hack logic here to support different kind of strides. todo fix it. - // ascale t, 1; bscale E, N, 1, move ptr to E - if(i.value == 1) - { - ptr_ += - expert_id * (problem.StrideDs[1] ? 
problem.StrideDs[1] * problem.N : 1); - } return make_dynamic_buffer( - ptr_, ds_grid_desc_m_n[i].GetElementSpaceSize()); + p_ds_grid[i], ds_grid_desc_m_n[i].GetElementSpaceSize()); }, Number{}); @@ -1526,7 +1750,8 @@ struct GridwiseMoeGemm Sequence, uniform_sequence_gen_t>, // ThreadTransferSrcResetCoordinateAfterRunFlags - Sequence, // ThreadTransferDstResetCoordinateAfterRunFlags + Sequence, // ThreadTransferDstResetCoordinateAfterRunFlags + IndexType, 1, // ScatterDim true, // OutputScatter: false, only use scatter weights scatter_weight_idx // ScatterWeightIdx: ascale @@ -1538,7 +1763,6 @@ struct GridwiseMoeGemm auto c_grid_buf = make_dynamic_buffer( p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); - // space filling curve for threadwise C in VGPR constexpr auto sfc_c_vgpr = SpaceFillingCurve, Sequence<0, 1, 2, 3, 4, 5, 6, 7>, @@ -1568,35 +1792,21 @@ struct GridwiseMoeGemm constexpr auto EMRepeats = CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl / EMThreads; constexpr auto ENThreads = CDEBlockTransferCluster{}.At(I2) * CDEBlockTransferCluster{}.At(I3); - const float* p_sorted_weights_0 = p_ds_grid[I0]; static_for<0, num_access, 1>{}([&](auto access_id) { // make sure it's safe to write to LDS - StaticallyIndexedArray scatter_offsets; - StaticallyIndexedArray scatter_weights; //= for topk + StaticallyIndexedArray scatter_offsets; auto dstidx = sfc_cde_block.GetIndex(access_id); const index_t c_token_pos = block_m_id * MPerBlock + threadIdx.x / ENThreads * EMRepeats + dstidx(I1); static_for<0, EMRepeats, 1>{}([&](auto m0) { const index_t fused_token = p_sorted_token_ids[c_token_pos + m0]; - index_t token_offset = fused_token & 0xffffff; - float weight = token_offset < problem.NumTokens - ? 
p_sorted_weights_0[token_offset * problem.StrideDs[0]] - : 0.0; + IndexType token_offset = fused_token & 0xffffff; if constexpr(IsInputGemm) { token_offset = token_offset * problem.TopK + (fused_token >> 24); } - if constexpr(MulRoutedWeight) - { - const float* p_sorted_weights_2 = p_ds_grid[I2]; - if constexpr(sizeof(ADataType) < 2) - weight = p_sorted_weights_2[c_token_pos + m0] * weight; - else - weight = p_sorted_weights_2[c_token_pos + m0]; - } - scatter_offsets(m0) = token_offset * problem.N; - scatter_weights(m0) = weight; + scatter_offsets(m0) = static_cast(token_offset) * problem.N; }); block_sync_lds(); @@ -1604,7 +1814,7 @@ struct GridwiseMoeGemm // each thread write its data from VGPR to LDS c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, sfc_c_vgpr.GetIndexTupleOfNumber(access_id), - c_thread_buf, + c_thread_buf_fp32, c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, c_shuffle_block_buf); @@ -1617,8 +1827,7 @@ struct GridwiseMoeGemm c_ds_buf_refs, tie(e_grid_desc_mblock_mperblock_nblock_nperblock), tie(c_grid_buf), - scatter_offsets, - scatter_weights); + scatter_offsets); if constexpr(access_id < num_access - 1) { @@ -1643,9 +1852,7 @@ struct GridwiseMoeGemm template + TailNumber TailNum = TailNumber::Odd> __device__ static void Run_2Lds(const index_t* p_sorted_token_ids, const index_t* p_sorted_expert_ids, const index_t* p_max_token_id, @@ -1721,7 +1928,7 @@ struct GridwiseMoeGemm if(token_pos >= max_token_id || expert_block_id * MPerBlock >= max_token_id || token0 >= problem.NumTokens) return; - StaticallyIndexedArray + StaticallyIndexedArray gather_offsets; //= p_sorted_token_ids[token_pos]; static_for<0, AMRepeats, 1>{}([&](auto m0) { const index_t fused_token = p_sorted_token_ids[token_pos + m0]; @@ -1730,7 +1937,7 @@ struct GridwiseMoeGemm { token_offset = token_offset * problem.TopK + (fused_token >> 24); } - gather_offsets(m0) = token_offset * problem.K; + gather_offsets(m0) = static_cast(token_offset) * problem.K; }); const index_t 
expert_stride = __builtin_amdgcn_readfirstlane(problem.N * problem.K); @@ -1773,6 +1980,7 @@ struct GridwiseMoeGemm 1, AThreadTransferSrcResetCoordinateAfterRun, true, + IndexType, 1, 2>(a_grid_desc_ak0_m_ak1, make_multi_index(0, 0, 0), @@ -1967,11 +2175,12 @@ struct GridwiseMoeGemm const DDataType* ptr_ = p_ds_grid[i]; // hack logic here to support different kind of strides. todo fix it. // ascale t, 1; bscale E, N, 1, move ptr to E - if(i.value == 1) - { - ptr_ += - expert_id * (problem.StrideDs[1] ? problem.StrideDs[1] * problem.N : 1); - } + // if(i.value == 1) + // { + // ptr_ += + // expert_id * (problem.StrideDs[1] ? problem.StrideDs[1] * problem.N : + // 1); + // } return make_dynamic_buffer( ptr_, ds_grid_desc_m_n[i].GetElementSpaceSize()); }, @@ -2036,7 +2245,8 @@ struct GridwiseMoeGemm Sequence, uniform_sequence_gen_t>, // ThreadTransferSrcResetCoordinateAfterRunFlags - Sequence, // ThreadTransferDstResetCoordinateAfterRunFlags + Sequence, // ThreadTransferDstResetCoordinateAfterRunFlags + IndexType, 1, // ScatterDim true, // OutputScatter: false, only use scatter weights scatter_weight_idx // ScatterWeightIdx: ascale @@ -2078,12 +2288,9 @@ struct GridwiseMoeGemm constexpr auto EMRepeats = CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl / EMThreads; constexpr auto ENThreads = CDEBlockTransferCluster{}.At(I2) * CDEBlockTransferCluster{}.At(I3); - const float* p_sorted_weights_0 = p_ds_grid[I0]; static_for<0, num_access, 1>{}([&](auto access_id) { // make sure it's safe to write to LDS - StaticallyIndexedArray - scatter_offsets; //= p_sorted_token_ids[c_token_pos]; - StaticallyIndexedArray scatter_weights; //= for topk + StaticallyIndexedArray scatter_offsets; auto dstidx = sfc_cde_block.GetIndex(access_id); const index_t c_token_pos = @@ -2091,23 +2298,11 @@ struct GridwiseMoeGemm static_for<0, EMRepeats, 1>{}([&](auto m0) { const index_t fused_token = p_sorted_token_ids[c_token_pos + m0]; index_t token_offset = fused_token & 0xffffff; - float weight = 
token_offset < problem.NumTokens - ? p_sorted_weights_0[token_offset * problem.StrideDs[0]] - : 0.0; if constexpr(IsInputGemm) { token_offset = token_offset * problem.TopK + (fused_token >> 24); } - if constexpr(MulRoutedWeight) - { - const float* p_sorted_weights_2 = p_ds_grid[I2]; - if constexpr(sizeof(ADataType) < 2) - weight = p_sorted_weights_2[c_token_pos + m0] * weight; - else - weight = p_sorted_weights_2[c_token_pos + m0]; - } - scatter_offsets(m0) = token_offset * problem.N; - scatter_weights(m0) = weight; + scatter_offsets(m0) = static_cast(token_offset) * problem.N; }); block_sync_lds(); @@ -2128,8 +2323,7 @@ struct GridwiseMoeGemm c_ds_buf_refs, tie(e_grid_desc_mblock_mperblock_nblock_nperblock), tie(c_grid_buf), - scatter_offsets, - scatter_weights); + scatter_offsets); if constexpr(access_id < num_access - 1) { diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp index bb9a452761..bd6fe772e4 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp @@ -41,6 +41,7 @@ template struct ThreadwiseTensorSliceTransfer_v3r1_gather @@ -88,7 +89,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_gather const DstDesc& dst_desc, const Index& dst_slice_origin, const DstElementwiseOperation& dst_element_op, - const StaticallyIndexedArray& gather_offsets) + const StaticallyIndexedArray& gather_offsets) : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)), src_element_op_(src_element_op), @@ -221,7 +222,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_gather auto gather_offset = gather_offsets_(ordered_src_access_idx[Number{}]); - const index_t ld_offset = src_coord_.GetOffset() + gather_offset; + const IndexType ld_offset = 
src_coord_.GetOffset() + gather_offset; src_oob_thread_scratch_tuple_(thread_scratch_id) .template SetAsType(src_data_idx_seq, true); @@ -935,7 +936,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_gather DstCoord dst_coord_; const SrcElementwiseOperation src_element_op_; const DstElementwiseOperation dst_element_op_; - StaticallyIndexedArray gather_offsets_; + StaticallyIndexedArray gather_offsets_; }; } // namespace ck diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp index 6a1c195dc1..7cd0a0fc7f 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -43,6 +43,7 @@ template typename DstResetCoordinateAfterRunFlags, // Sequence + typename IndexType, index_t ScatterDim = 1, bool OutputScatter = true, index_t ScatterWeightIdx = 3, @@ -153,7 +154,6 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter enable_if_t = false> __device__ void RunRead(const SrcDescs& src_descs, const SrcBuffers& src_bufs, - StaticallyIndexedArray& scatter_weights, Number thread_scratch_id = Number{}) { // loop over space-filling curve @@ -172,31 +172,8 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter src_coords_[i]); oob_val = oob_val & is_src_valid; - if(i.value == ScatterWeightIdx) - { - static_assert(SrcScalarPerVectors{}[Number{}] == 1, - "scatter weight dim, should only one vec"); - constexpr auto iScatter = - SrcSpaceFillingCurve::GetIndex(iAccess)(Number{}); - static_for<0, SrcScalarPerVector, 1>{}([&](auto j) { - src_vectors(i).template AsType()(j) = - scatter_weights(Number{}); - }); - } - else if constexpr(SrcScalarPerVectors{}[i] == 1) - { - auto data_types = SrcDatas{}; - using DataType = remove_cvref_t; - const auto tmp = - src_bufs[i].template Get(src_coords_[i].GetOffset(), true); - static_for<0, SrcScalarPerVector, 1>{}( - [&](auto j) { src_vectors(i).template AsType()(j) = tmp; }); - } - else - { - src_vectors(i).template AsType()(I0) = - src_bufs[i].template Get(src_coords_[i].GetOffset(), true); - } + src_vectors(i).template AsType()(I0) = + src_bufs[i].template Get(src_coords_[i].GetOffset(), true); }); constexpr auto get_elem_op_vec_len = []() { @@ -412,7 +389,7 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter enable_if_t = false> __device__ void RunWrite(const DstDescs& dst_descs, DstBuffers dst_bufs, - StaticallyIndexedArray& scatter_offsets, + StaticallyIndexedArray& scatter_offsets, Number thread_scratch_id = Number{}) { OOBCheck(thread_scratch_id); @@ -420,8 +397,8 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter // loop over space-filling curve static_for<0, dst_num_access, 
1>{}([&](auto iAccess) { - auto dst_vectors = dst_vectors_tuple_[thread_scratch_id][iAccess]; - auto scatter_offset = 0; + auto dst_vectors = dst_vectors_tuple_[thread_scratch_id][iAccess]; + IndexType scatter_offset = 0; if constexpr(OutputScatter) { constexpr auto iScatter = @@ -431,8 +408,10 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter // copy data from buf_vectors into dst_bufs static_for<0, nDst, 1>{}([&](auto i) { using dst_vector_t = typename remove_cvref_t::type; - auto dst_offset = scatter_offset + dst_coords_[i].GetOffset(); + IndexType dst_offset = scatter_offset + (dst_coords_[i].GetOffset()); const bool is_dst_valid = dst_offset < dst_descs[i].GetElementSpaceSize(); + // coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_descs[i], + // dst_coords_[i]); constexpr InMemoryDataOperationEnum DstInMemOp = static_cast(DstInMemOps::At(i.value)); dst_bufs(i).template Update( @@ -488,10 +467,9 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter const SrcBuffers& src_bufs, const DstDescs& dst_descs, DstBuffers dst_bufs, - StaticallyIndexedArray& scatter_offsets, - StaticallyIndexedArray& scatter_weights) + StaticallyIndexedArray& scatter_offsets) { - RunRead(src_descs, src_bufs, scatter_weights); + RunRead(src_descs, src_bufs); RunWrite(dst_descs, dst_bufs, scatter_offsets); } diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp index 1a0ea27eab..1d80f196b5 100644 --- a/include/ck/utility/dynamic_buffer.hpp +++ b/include/ck/utility/dynamic_buffer.hpp @@ -24,7 +24,8 @@ template + AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, + typename IndexType = index_t> struct DynamicBuffer { using type = T; @@ -59,16 +60,16 @@ struct DynamicBuffer return BufferAddressSpace; } - __host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; } + __host__ __device__ constexpr const T& operator[](IndexType i) const { return p_data_[i]; } - __host__ __device__ 
constexpr T& operator()(index_t i) { return p_data_[i]; } + __host__ __device__ constexpr T& operator()(IndexType i) { return p_data_[i]; } template >::type, typename scalar_type>::type>::value || !is_native_type(), bool>::type = false> - __host__ __device__ constexpr auto Get(index_t i, bool is_valid_element) const + __host__ __device__ constexpr auto Get(IndexType i, bool is_valid_element) const { // X contains multiple T constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; @@ -79,7 +80,7 @@ struct DynamicBuffer "wrong! X should contain multiple T"); #if CK_USE_AMD_BUFFER_LOAD - bool constexpr use_amd_buffer_addressing = true; + bool constexpr use_amd_buffer_addressing = sizeof(IndexType) <= sizeof(int32_t); #else bool constexpr use_amd_buffer_addressing = false; #endif @@ -140,7 +141,7 @@ struct DynamicBuffer typename enable_if>::type, typename scalar_type>::type>::value, bool>::type = false> - __host__ __device__ void Update(index_t i, bool is_valid_element, const X& x) + __host__ __device__ void Update(IndexType i, bool is_valid_element, const X& x) { if constexpr(Op == InMemoryDataOperationEnum::Set) { @@ -191,8 +192,8 @@ struct DynamicBuffer template __host__ __device__ void DirectCopyToLds(DstBuffer& dst_buf, - index_t src_offset, - index_t dst_offset, + IndexType src_offset, + IndexType dst_offset, bool is_valid_element) const { // Copy data from global to LDS memory using direct loads. @@ -214,7 +215,7 @@ struct DynamicBuffer typename scalar_type>::type>::value || !is_native_type(), bool>::type = false> - __host__ __device__ void Set(index_t i, bool is_valid_element, const X& x) + __host__ __device__ void Set(IndexType i, bool is_valid_element, const X& x) { // X contains multiple T constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; @@ -224,8 +225,8 @@ struct DynamicBuffer static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, "wrong! 
X should contain multiple T"); -#if CK_USE_AMD_BUFFER_STORE - bool constexpr use_amd_buffer_addressing = true; +#if CK_USE_AMD_BUFFER_LOAD + bool constexpr use_amd_buffer_addressing = sizeof(IndexType) <= sizeof(int32_t); #else bool constexpr use_amd_buffer_addressing = false; #endif @@ -342,11 +343,12 @@ struct DynamicBuffer { if(is_valid_element) { -#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS +#if 0 X tmp = x; __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X)); #else + // if(i >= 2169041600) *c_style_pointer_cast(&p_data_[i]) = x; #endif } @@ -357,7 +359,7 @@ struct DynamicBuffer typename enable_if>::type, typename scalar_type>::type>::value, bool>::type = false> - __host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x) + __host__ __device__ void AtomicAdd(IndexType i, bool is_valid_element, const X& x) { using scalar_t = typename scalar_type>::type; @@ -378,12 +380,14 @@ struct DynamicBuffer (is_same_v, half_t> && scalar_per_x_vector % 2 == 0) || (is_same_v, bhalf_t> && scalar_per_x_vector % 2 == 0); #elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT) - bool constexpr use_amd_buffer_addressing = is_same_v, int32_t>; + bool constexpr use_amd_buffer_addressing = + sizeof(IndexType) <= sizeof(int32_t) && is_same_v, int32_t>; #elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT bool constexpr use_amd_buffer_addressing = - is_same_v, float> || - (is_same_v, half_t> && scalar_per_x_vector % 2 == 0) || - (is_same_v, bhalf_t> && scalar_per_x_vector % 2 == 0); + sizeof(IndexType) <= sizeof(int32_t) && + (is_same_v, float> || + (is_same_v, half_t> && scalar_per_x_vector % 2 == 0) || + (is_same_v, bhalf_t> && scalar_per_x_vector % 2 == 0)); #else bool constexpr use_amd_buffer_addressing = false; #endif @@ -408,12 +412,12 @@ struct DynamicBuffer typename enable_if>::type, typename scalar_type>::type>::value, bool>::type = false> - __host__ __device__ void AtomicMax(index_t i, bool 
is_valid_element, const X& x) + __host__ __device__ void AtomicMax(IndexType i, bool is_valid_element, const X& x) { // X contains multiple T - constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; + constexpr IndexType scalar_per_t_vector = scalar_type>::vector_size; - constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; + constexpr IndexType scalar_per_x_vector = scalar_type>::vector_size; static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, "wrong! X should contain multiple T"); @@ -421,8 +425,9 @@ struct DynamicBuffer static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem"); #if CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 - using scalar_t = typename scalar_type>::type; - bool constexpr use_amd_buffer_addressing = is_same_v, double>; + using scalar_t = typename scalar_type>::type; + bool constexpr use_amd_buffer_addressing = + sizeof(IndexType) <= sizeof(int32_t) && is_same_v, double>; #else bool constexpr use_amd_buffer_addressing = false; #endif @@ -455,6 +460,17 @@ __host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize el p, element_space_size}; } +template +__host__ __device__ constexpr auto make_long_dynamic_buffer(T* p, + ElementSpaceSize element_space_size) +{ + return DynamicBuffer{ + p, element_space_size}; +} + template < AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp index b1a0c1fc5d..ec055fb2a2 100644 --- a/include/ck/utility/tuple_helper.hpp +++ b/include/ck/utility/tuple_helper.hpp @@ -23,6 +23,13 @@ __host__ __device__ constexpr auto generate_tuple(F&& f, Number) return generate_tuple_for(f, make_index_sequence{}); } +template +__host__ __device__ constexpr auto generate_tuple(F&& f, LongNumber) +{ + return unpack([&f](auto&&... 
xs) { return make_tuple(f(xs)...); }, + typename arithmetic_sequence_gen<0, N, 1>::type{}); +} + template __host__ __device__ constexpr auto generate_tie(F&& f, Number) { diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp index 72c9dc86ac..120bf7484a 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp @@ -23,12 +23,14 @@ template + index_t ActivationType_ = 0, + bool MulRoutedWeight = true, + typename ComputeTypeA = CDataType, + typename ComputeTypeB = ComputeTypeA> struct ReferenceMoeGemm : public device::BaseOperator { // Argument + static constexpr auto ActivationType = ActivationType_; struct Argument : public device::BaseArgument { Argument(const Tensor& sorted_token_ids, @@ -36,7 +38,9 @@ struct ReferenceMoeGemm : public device::BaseOperator const Tensor& max_token_id, const index_t sorted_tile_size, const Tensor& a_t_k, + const Tensor& a_scale_t, const Tensor& b_e_n_k, + const Tensor& b_scale_e_n, Tensor& c_t_k_n, const Tensor& d2, AElementwiseOperation a_element_op, @@ -47,7 +51,9 @@ struct ReferenceMoeGemm : public device::BaseOperator max_token_id_{max_token_id}, sorted_tile_size_{sorted_tile_size}, a_t_k_{a_t_k}, + a_scale_t_{a_scale_t}, b_e_n_k_{b_e_n_k}, + b_scale_e_n_{b_scale_e_n}, c_t_k_n_{c_t_k_n}, d2_{d2}, a_element_op_{a_element_op}, @@ -61,7 +67,9 @@ struct ReferenceMoeGemm : public device::BaseOperator const Tensor& max_token_id_; index_t sorted_tile_size_; const Tensor& a_t_k_; + const Tensor& a_scale_t_; const Tensor& b_e_n_k_; + const Tensor& b_scale_e_n_; Tensor& c_t_k_n_; const Tensor& d2_; @@ -77,11 +85,17 @@ struct ReferenceMoeGemm : public device::BaseOperator float Run(const Argument& arg) { - auto f_mk_kn_mn = [&](auto m, auto n) { + static_assert(ActivationType < 2, "Not supported 
activation type"); + const int full_n = arg.c_t_k_n_.mDesc.GetLengths()[2]; + auto f_mk_kn_mn = [&](auto m, auto n) { const int K = arg.a_t_k_.mDesc.GetLengths()[1]; + AccDataType v_acc_up{0}; + ComputeTypeB v_b_up{0}; AccDataType v_acc{0}; + ComputeTypeA v_a{0}; ComputeTypeB v_b{0}; + const int t = arg.sorted_token_ids_(m) & 0xffffff; const int topk_id = (arg.sorted_token_ids_(m) & 0xff000000) >> 24; const int e = arg.expert_ids_(m / arg.sorted_tile_size_); @@ -102,7 +116,7 @@ struct ReferenceMoeGemm : public device::BaseOperator #if CK_USE_PK4_LAYOUT_SHUFFLE v_a = i4_to_f32_gfx9(i4); #else - v_a = i4 - 8; + v_a = i4 - 8; #endif } else @@ -112,42 +126,79 @@ struct ReferenceMoeGemm : public device::BaseOperator // same for B matrix if constexpr(is_same_v) { - uint8_t i4x2 = arg.b_e_n_k_(e, k, n).data; - uint8_t i4 = 0; + uint8_t i4x2 = arg.b_e_n_k_(e, k, n).data; + uint8_t i4x2_up = arg.b_e_n_k_(e, k, n + full_n).data; + uint8_t i4 = 0; + uint8_t i4_up = 0; if(k % 2 == 1) - i4 = (i4x2 >> 0) & 0xf; + { + i4 = (i4x2 >> 0) & 0xf; + i4_up = (i4x2_up >> 0) & 0xf; + } else - i4 = (i4x2 >> 4) & 0xf; + { + i4 = (i4x2 >> 4) & 0xf; + i4_up = (i4x2_up >> 4) & 0xf; + } #if CK_USE_PK4_LAYOUT_SHUFFLE - v_b = i4_to_f32_gfx9(i4); + v_b = i4_to_f32_gfx9(i4); + v_b_up = i4_to_f32_gfx9(i4_up); #else - v_b = i4 - 8; + v_b = i4 - 8; + v_b_up = i4_up - 8; #endif } else { arg.b_element_op_(v_b, arg.b_e_n_k_(e, k, n)); + arg.b_element_op_(v_b_up, arg.b_e_n_k_(e, k, n + full_n)); } v_acc += ck::type_convert(v_a) * ck::type_convert(v_b); + v_acc_up += ck::type_convert(v_a) * + ck::type_convert(v_b_up); } CDataType v_c{0}; - + CDataType v_c_up{0}; if constexpr(MulRoutedWeight) { v_acc *= v_topk_w; + v_acc_up *= v_topk_w; } arg.c_element_op_(v_c, v_acc); + arg.c_element_op_(v_c_up, v_acc_up); - arg.c_t_k_n_(t, topk_id, n) = v_c; + if constexpr(ActivationType == 1) + { + v_c = v_c * arg.b_scale_e_n_(e, n) * arg.a_scale_t_(t); + if constexpr(is_same_v) + { + v_c_up *= 16; + v_c *= 16; + } + 
tensor_operation::element_wise::Silu{}(v_c, v_c); + v_c_up = v_c_up * arg.b_scale_e_n_(e, n + full_n) * arg.a_scale_t_(t); + arg.c_t_k_n_(t, topk_id, n) = v_c * v_c_up; + } + else if constexpr(ActivationType == 0) + { + v_c = v_c * arg.b_scale_e_n_(e, n) * arg.a_scale_t_(t); + if constexpr(is_same_v) + { + v_c_up *= 16; + v_c *= 16; + } + tensor_operation::element_wise::Gelu{}(v_c, v_c); + v_c_up = v_c_up * arg.b_scale_e_n_(e, n + full_n) * arg.a_scale_t_(t); + arg.c_t_k_n_(t, topk_id, n) = v_c * v_c_up; + } } }; const ck::index_t max_token_id = arg.max_token_id_(0); - make_ParallelTensorFunctor( - f_mk_kn_mn, max_token_id, arg.c_t_k_n_.mDesc.GetLengths()[2])( + make_ParallelTensorFunctor(f_mk_kn_mn, max_token_id, full_n)( std::thread::hardware_concurrency()); return 0; @@ -173,7 +224,9 @@ struct ReferenceMoeGemm : public device::BaseOperator const Tensor& max_token_id, const index_t sorted_tile_size, const Tensor& a_t_k, + const Tensor& a_scale_n, const Tensor& b_e_n_k, + const Tensor& b_scale_e_n, Tensor& c_t_k_n, const Tensor& d2, AElementwiseOperation a_element_op, @@ -185,7 +238,9 @@ struct ReferenceMoeGemm : public device::BaseOperator max_token_id, sorted_tile_size, a_t_k, + a_scale_n, b_e_n_k, + b_scale_e_n, c_t_k_n, d2, a_element_op, diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp index fb5c71e30a..5c932fcb18 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp @@ -25,7 +25,7 @@ template struct ReferenceMoeGemm2 : public device::BaseOperator From 854159fd00d43736bc7c69a491f30006a5dce67e Mon Sep 17 00:00:00 2001 From: John Afaganis Date: Wed, 23 Apr 2025 11:25:41 -0600 Subject: [PATCH 064/443] Update CODEOWNERS (#2119) --- .github/CODEOWNERS | 12 ++++++------ 1 file changed, 6 insertions(+), 6 
deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index eb69bd7f39..ccdfb0f6fb 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,8 +1,8 @@ -* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent @ThomasNing +* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent @ThomasNing @coderfeli # Documentation files -docs/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing -*.md @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing -*.rst @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing -.readthedocs.yaml @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing +docs/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli +*.md @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli +*.rst @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli +.readthedocs.yaml @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli # Header directory for Doxygen documentation -library/include/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing +library/include/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca 
@afagaj @asleepzzz @ThomasNing @coderfeli From 5487289fc479c875b181152c0383fdf1da7b2f00 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Thu, 24 Apr 2025 03:40:18 +0800 Subject: [PATCH 065/443] [CK_TILE] support gfx950 matrix core in 01_fmha fwd (#2110) * gfx950 01_fmha fwd * fix comment --------- Co-authored-by: Thomas Ning --- include/ck_tile/ops/gemm.hpp | 3 + include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 24 ++ .../gemm/warp/warp_gemm_attribute_mfma.hpp | 27 ++- .../warp/warp_gemm_attribute_mfma_impl.hpp | 229 ++++++++++++++++++ .../gemm/warp/warp_gemm_attribute_smfmac.hpp | 5 + 5 files changed, 286 insertions(+), 2 deletions(-) diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp index 794f7f21f2..35f5170179 100644 --- a/include/ck_tile/ops/gemm.hpp +++ b/include/ck_tile/ops/gemm.hpp @@ -44,8 +44,11 @@ #include "ck_tile/ops/gemm/warp/warp_gemm.hpp" #include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp" #include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp" #include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp" #include "ck_tile/ops/gemm/warp/warp_gemm_impl.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp index bd7a0566a2..e6350a8827 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp @@ -49,10 +49,16 @@ using WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution = WarpGemmImpl>>; +#if defined(__gfx950__) +using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution = + WarpGemmImpl>>; +#else using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution = WarpGemmImpl, 2>>; 
+#endif #if defined(__gfx950__) using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution = @@ -65,10 +71,16 @@ using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution = 2>>; #endif +#if defined(__gfx950__) +using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution = + WarpGemmImpl>>; +#else using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution = WarpGemmImpl, 2>>; +#endif using WarpGemmMfmaF16F16F32M4N64K16 = WarpGemmImpl, @@ -123,10 +135,16 @@ using WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution = WarpGemmImpl>>; +#if defined(__gfx950__) +using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution = + WarpGemmImpl>>; +#else using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution = WarpGemmImpl, 2>>; +#endif #if defined(__gfx950__) using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution = @@ -139,10 +157,16 @@ using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution = 2>>; #endif +#if defined(__gfx950__) +using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution = + WarpGemmImpl>>; +#else using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution = WarpGemmImpl, 2>>; +#endif using WarpGemmMfmaBf16Bf16F32M4N64K16 = WarpGemmImpl, diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp index e7d4c37966..93ccdb5f57 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp @@ -356,7 +356,7 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution } }; -template +template struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB { using Impl = remove_cvref_t; @@ -373,6 +373,7 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB static constexpr index_t kN = Impl::kM; static constexpr index_t kK = Impl::kK; static constexpr index_t kKPerThread = Impl::kABKPerLane; + static constexpr index_t SFactor = SFactor_; // group how many 
CM1 together CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; } @@ -386,7 +387,7 @@ struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB tuple>, sequence<2>, sequence<1>>; - +#if 0 using BWarpDstrEncoding = tile_distribution_encoding< sequence<>, tuple>, sequence<2, 2>, sequence<0, 2>>; +#else + // TODO: more test not only 32x32 + using BWarpDstrEncoding = tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<2>, + sequence<1>>; + using CWarpDstrEncoding = tile_distribution_encoding< + sequence<>, + tuple, + sequence>, + tuple>, + tuple>, + sequence<2, 2>, + sequence<0, 2>>; +#endif template // c_vec += a_vec * b_vec CK_TILE_DEVICE void operator()(CVecType& c_vec, diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp index f937899ffd..08f813a1e3 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp @@ -748,6 +748,235 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M64N4K4 } }; +// gfx950 +template +struct WarpGemmAttributeMfmaImplF16F16F32M32N32K16 +{ + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = fp16_t; + using BDataType = fp16_t; + using CDataType = float; + + using AVecType = ext_vector_t; + using BVecType = ext_vector_t; + using CVecType = ext_vector_t; + + static constexpr index_t kM = 32; + static constexpr index_t kN = 32; + static constexpr index_t kK = 16; + + static constexpr index_t kAMBlock = 1; + static constexpr index_t kBNBlock = 1; + + static constexpr index_t kAMLane = 32; + static constexpr index_t kBNLane = 32; + static constexpr index_t kABKLane = 2; + static constexpr index_t kABKPerLane = 8; + + static constexpr index_t kCMLane = 2; + static constexpr index_t kCNLane = 32; + static constexpr index_t kCM0PerLane = 4; + static constexpr index_t kCM1PerLane = 4; + + // c_vec 
+= a_vec * b_vec + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const + { + DISPATCH_MFMA_CTRL_("v_mfma_f32_32x32x16_f16", Ctrl) + else + { +#if defined(__gfx950__) + c_vec = __builtin_amdgcn_mfma_f32_32x32x16_f16(a_vec, b_vec, c_vec, 0, 0, 0); +#elif defined(__gfx90a__) || defined(__gfx94__) + static_for<0, 2, 1>{}([&](auto k) { + c_vec = __builtin_amdgcn_mfma_f32_32x32x8f16( + reinterpret_cast&>(a_vec) + .template get_as>()[number{}], + reinterpret_cast&>(b_vec) + .template get_as>()[number{}], + c_vec, + 0, + 0, + 0); + }); +#elif defined(__gfx908__) + static_for<0, 4, 1>{}([&](auto k) { + c_vec = __builtin_amdgcn_mfma_f32_32x32x4f16( + reinterpret_cast&>(a_vec) + .template get_as>()[number{}], + reinterpret_cast&>(b_vec) + .template get_as>()[number{}], + c_vec, + 0, + 0, + 0); + }); +#else + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; +#endif + } + } + + // c_vec = a_vec * b_vec + CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const + { +#if defined(__gfx950__) + return __builtin_amdgcn_mfma_f32_32x32x16_f16(a_vec, b_vec, fp32x16_t{0.f}, 0, 0, 0); +#elif defined(__gfx90a__) || defined(__gfx94__) + CVecType c_vec{0.f}; + static_for<0, 2, 1>{}([&](auto k) { + c_vec = __builtin_amdgcn_mfma_f32_32x32x8f16( + reinterpret_cast&>(a_vec) + .template get_as>()[number{}], + reinterpret_cast&>(b_vec) + .template get_as>()[number{}], + c_vec, + 0, + 0, + 0); + }); + return c_vec; +#elif defined(__gfx908__) + CVecType c_vec{0.f}; + static_for<0, 4, 1>{}([&](auto k) { + c_vec = __builtin_amdgcn_mfma_f32_32x32x4f16( + reinterpret_cast&>(a_vec) + .template get_as>()[number{}], + reinterpret_cast&>(b_vec) + .template get_as>()[number{}], + c_vec, + 0, + 0, + 0); + }); + return c_vec; +#else + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; + return CVecType{0.f}; +#endif + } +}; + +template +struct 
WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K16 +{ + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = bf16_t; + using BDataType = bf16_t; + using CDataType = float; + + using AVecType = ext_vector_t; + using BVecType = ext_vector_t; + using CVecType = ext_vector_t; + + static constexpr index_t kM = 32; + static constexpr index_t kN = 32; + static constexpr index_t kK = 16; + + static constexpr index_t kAMBlock = 1; + static constexpr index_t kBNBlock = 1; + + static constexpr index_t kAMLane = 32; + static constexpr index_t kBNLane = 32; + static constexpr index_t kABKLane = 2; + static constexpr index_t kABKPerLane = 8; + + static constexpr index_t kCMLane = 2; + static constexpr index_t kCNLane = 32; + static constexpr index_t kCM0PerLane = 4; + static constexpr index_t kCM1PerLane = 4; + + // c_vec += a_vec * b_vec + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const + { + DISPATCH_MFMA_CTRL_("v_mfma_f32_32x32x16_bf16", Ctrl) + else + { +#if defined(__gfx950__) + c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a_vec, b_vec, c_vec, 0, 0, 0); +#elif defined(__gfx90a__) || defined(__gfx94__) + static_for<0, 2, 1>{}([&](auto k) { + c_vec = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k( + reinterpret_cast&>(a_vec) + .template get_as>()[number{}], + reinterpret_cast&>(b_vec) + .template get_as>()[number{}], + c_vec, + 0, + 0, + 0); + }); +#elif defined(__gfx908__) + static_for<0, 4, 1>{}([&](auto k) { + c_vec = __builtin_amdgcn_mfma_f32_32x32x4bf16( + reinterpret_cast&>(a_vec) + .template get_as>()[number{}], + reinterpret_cast&>(b_vec) + .template get_as>()[number{}], + c_vec, + 0, + 0, + 0); + }); +#else + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; +#endif + } + } + + // c_vec = a_vec * b_vec + CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const + { +#if defined(__gfx950__) + return 
__builtin_amdgcn_mfma_f32_32x32x16_bf16(a_vec, b_vec, fp32x16_t{0.f}, 0, 0, 0); +#elif defined(__gfx90a__) || defined(__gfx94__) + CVecType c_vec{0.f}; + static_for<0, 2, 1>{}([&](auto k) { + c_vec = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k( + reinterpret_cast&>(a_vec) + .template get_as>()[number{}], + reinterpret_cast&>(b_vec) + .template get_as>()[number{}], + c_vec, + 0, + 0, + 0); + }); + return c_vec; +#elif defined(__gfx908__) + CVecType c_vec{0.f}; + static_for<0, 4, 1>{}([&](auto k) { + c_vec = __builtin_amdgcn_mfma_f32_32x32x4bf16( + reinterpret_cast&>(a_vec) + .template get_as>()[number{}], + reinterpret_cast&>(b_vec) + .template get_as>()[number{}], + c_vec, + 0, + 0, + 0); + }); + return c_vec; +#else + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; + return CVecType{0.f}; +#endif + } +}; + // FP8 template struct WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp index adf548aaca..84cdf17d66 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp @@ -1,3 +1,8 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + #include "ck_tile/core.hpp" #include "ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp" From 02ce6d39ea11b06d583da04a5d3feb4cb66a55a0 Mon Sep 17 00:00:00 2001 From: rocking Date: Thu, 24 Apr 2025 18:52:58 +0800 Subject: [PATCH 066/443] Only generate specific hdim (#2120) --- .../ck_tile/01_fmha/codegen/ops/fmha_bwd.py | 8 +++++-- .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py | 13 ++++++---- .../01_fmha/codegen/ops/fmha_fwd_appendkv.py | 6 +++-- .../01_fmha/codegen/ops/fmha_fwd_splitkv.py | 10 ++++---- example/ck_tile/01_fmha/generate.py | 24 ++++++++++++++----- 5 files changed, 42 insertions(+), 19 deletions(-) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py index 1e6755c631..932f6020b6 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py @@ -866,9 +866,11 @@ def write_single_bwd_convert_dq_kernel(kernel: FmhaBwdConvertQGradKernel, autoge def write_bwd_api(api_pool : FmhaBwdApiPool, autogen_dir: Path) -> None: (autogen_dir / FMHA_BWD_API_FILENAME).write_text(api_pool.api) -def write_blobs(output_dir : Path, filter_list : str, receipt, mask_impl) -> None: +def write_blobs(output_dir : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None: filter_list = filter_list.split('@') filter_list.extend([''] * (3 - len(filter_list))) + # TODO + assert optdim_list == [-1] kernels = get_bwd_dot_do_o_blobs(filter_list[0], receipt) for kernel in kernels: @@ -881,9 +883,11 @@ def write_blobs(output_dir : Path, filter_list : str, receipt, mask_impl) -> Non write_single_bwd_dq_dk_dv_kernel(kernel, output_dir) write_bwd_api(api_pool, output_dir) -def list_blobs(file_path : Path, filter_list : str, receipt, mask_impl) -> None: +def list_blobs(file_path : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None: filter_list = filter_list.split('@') filter_list.extend([''] * (3 - len(filter_list))) + # TODO + assert 
optdim_list == [-1] with file_path.open('a') as f: kernels = get_bwd_dot_do_o_blobs(filter_list[0], receipt) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index 3634810b37..c31a0ce954 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -429,7 +429,7 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]: else: return None -def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[FmhaFwdApiPool, List[FmhaFwdKernel]]: +def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl) -> Tuple[FmhaFwdApiPool, List[FmhaFwdKernel]]: # TODO: we don't support tuning yet, so pick up one value for vlayout/pipeline/pad # support this in future def get_pipelines(dtype, hdim) -> List[FmhaFwdPipeline]: @@ -507,6 +507,9 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm if kernel_filter != '': if not fnmatch.fnmatch(k.name, kernel_filter): continue + if optdim_list != [-1]: + if hdim not in optdim_list: + continue # 2 - Flash attention integration if receipt in (2, 3): cond = dtype in ['fp16', 'bf16'] @@ -557,15 +560,15 @@ def write_single_fwd_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None: def write_fwd_api(api_pool : FmhaFwdApiPool, autogen_dir: Path) -> None: (autogen_dir / FMHA_FWD_API_FILENAME).write_text(api_pool.api) -def write_blobs(output_dir : Path, kernel_filter : str, receipt, mask_impl) -> None: - api_pool, kernels = get_fwd_blobs(kernel_filter, receipt, mask_impl) +def write_blobs(output_dir : Path, kernel_filter : str, receipt, optdim_list, mask_impl) -> None: + api_pool, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl) for kernel in kernels: write_single_fwd_kernel(kernel, output_dir) write_fwd_api(api_pool, output_dir) -def list_blobs(file_path : Path, kernel_filter : str, receipt, mask_impl) -> None: +def 
list_blobs(file_path : Path, kernel_filter : str, receipt, optdim_list, mask_impl) -> None: with file_path.open('a') as f: - _, kernels = get_fwd_blobs(kernel_filter, receipt, mask_impl) + _, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl) for kernel in kernels: f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n") f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME) + "\n") diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py index f243020dc4..dc7ef712e2 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py @@ -343,13 +343,15 @@ def write_single_kernel(kernel: FmhaFwdAppendKVKernel, autogen_dir: Path) -> Non def write_fwd_appendkv_api(api_pool : FmhaFwdAppendKVApiPool, autogen_dir: Path) -> None: (autogen_dir / FMHA_FWD_APPENDKV_API_FILENAME).write_text(api_pool.api) -def write_blobs(output_dir : Path, kernel_filter : Optional[str], receipt, mask_impl) -> None: +def write_blobs(output_dir : Path, kernel_filter : Optional[str], receipt, optdim_list, mask_impl) -> None: + assert optdim_list == [-1] api_pool, kernels = get_fwd_appendkv_blobs(kernel_filter, receipt, mask_impl) for kernel in kernels: write_single_kernel(kernel, output_dir) write_fwd_appendkv_api(api_pool, output_dir) -def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_impl) -> None: +def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, optdim_list, mask_impl) -> None: + assert optdim_list == [-1] with file_path.open('a') as f: _, kernels = get_fwd_appendkv_blobs(kernel_filter, receipt, mask_impl) for kernel in kernels: diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index 0dccdf6bd6..ca49af1496 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ 
b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -440,10 +440,10 @@ class FmhaFwdSplitKVCombinePipeline: n = f'{self.tag}' if pn != '' : n += f'_{pn}' else: n += '_npad' - + if self.F_lse == 't' : n += '_lse' else: n += '_nlse' - + if self.F_squant == 't' : n += '_squant' else: n += '_nsquant' return n @@ -819,9 +819,10 @@ def write_fwd_splitkv_api(api_pool : FmhaFwdSplitKVApiPool, autogen_dir: Path) - file_path = autogen_dir / FMHA_FWD_SPLITKV_API_FILENAME file_path.write_text(api_pool.api) -def write_blobs(output_dir : Path, filter_list : str, receipt, mask_impl) -> None: +def write_blobs(output_dir : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None: filter_list = filter_list.split('@') filter_list.extend([''] * (2 - len(filter_list))) + assert optdim_list == [-1] kernels = get_fwd_splitkv_combine_blobs(filter_list[0], receipt) for kernel in kernels: @@ -831,9 +832,10 @@ def write_blobs(output_dir : Path, filter_list : str, receipt, mask_impl) -> Non write_single_kernel(kernel, output_dir) write_fwd_splitkv_api(api_pool, output_dir) -def list_blobs(file_path : Path, filter_list : str, receipt, mask_impl) -> None: +def list_blobs(file_path : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None: filter_list = filter_list.split('@') filter_list.extend([''] * (2 - len(filter_list))) + assert optdim_list == [-1] with file_path.open('a') as f: kernels = get_fwd_splitkv_combine_blobs(filter_list[0], receipt) diff --git a/example/ck_tile/01_fmha/generate.py b/example/ck_tile/01_fmha/generate.py index 25931da141..c2b0924eb3 100644 --- a/example/ck_tile/01_fmha/generate.py +++ b/example/ck_tile/01_fmha/generate.py @@ -30,7 +30,7 @@ handlers = dict( ) assert 0 < len(handlers) -def write_blobs(output_dir: Optional[str], api_list : List[str], filters_list : List[str], receipt, mask_impl) -> None: +def write_blobs(output_dir: Optional[str], api_list : List[str], filters_list : List[str], optdim_list : List[int], receipt, mask_impl) -> 
None: if output_dir is None: output_dir = Path(__file__).parent else: @@ -40,10 +40,10 @@ def write_blobs(output_dir: Optional[str], api_list : List[str], filters_list : for api, kernel_filter in zip(api_list, filters_list): handler = handlers[api][HandlerId.WRITE_BLOBS] - handler(output_dir, kernel_filter, receipt, mask_impl) + handler(output_dir, kernel_filter, receipt, optdim_list, mask_impl) # list all the files that will be generated -def list_blobs(output_file : Optional[str], api_list : List[str], filters_list : List[str], receipt, mask_impl) -> None: +def list_blobs(output_file : Optional[str], api_list : List[str], filters_list : List[str], optdim_list : List[int], receipt, mask_impl) -> None: assert output_file is not None file_path = Path(output_file) @@ -52,7 +52,7 @@ def list_blobs(output_file : Optional[str], api_list : List[str], filters_list : for api, kernel_filter in zip(api_list, filters_list): handler = handlers[api][HandlerId.LIST_BLOBS] - handler(file_path, kernel_filter, receipt, mask_impl) + handler(file_path, kernel_filter, receipt, optdim_list, mask_impl) if __name__ == "__main__": parser = argparse.ArgumentParser( @@ -113,12 +113,24 @@ if __name__ == "__main__": " 600-699: Only generate instance for aiter::mha_fwd && aiter::mha_fwd_splitkv && aiter::mha_bwd C++ api integration" ) + parser.add_argument( + "--optdim", + default='-1', + required=False, + help="only optimize the hdim in the list. separated by comma. -1 is the default choice" + \ + "eg. 
--optdim=32,64,128,256" + ) + args = parser.parse_args() api_list = args.direction.split(',') filter_list = args.filter.split(',') filter_list.extend([''] * (len(api_list) - len(filter_list))) + optdim_list = [int(hdim) for hdim in args.optdim.split(',')] + + if len(api_list) > 1: + assert optdim_list == [-1] if args.list_blobs is not None: - list_blobs(args.list_blobs, api_list, filter_list, int(args.receipt), mask_impl=args.mask) + list_blobs(args.list_blobs, api_list, filter_list, optdim_list, int(args.receipt), mask_impl=args.mask) else: - write_blobs(args.output_dir, api_list, filter_list, int(args.receipt), mask_impl=args.mask) + write_blobs(args.output_dir, api_list, filter_list, optdim_list, int(args.receipt), mask_impl=args.mask) From ba97363acd615efba5a4c5e3e0553c3ee14e566f Mon Sep 17 00:00:00 2001 From: alexxu-amd <159800977+alexxu-amd@users.noreply.github.com> Date: Thu, 24 Apr 2025 11:35:06 -0400 Subject: [PATCH 067/443] Setup Doxygen API reference for Docs (#2115) * setup Doxygen settings * add api_reference to requirements.txt * add doxygen file header * omit latex generation * remove testing entry * update Doxyfile --- docs/conf.py | 1 + docs/doxygen/Doxyfile | 938 +++++++++++++++++++++++------------ docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 143 ++++-- 4 files changed, 724 insertions(+), 360 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index e8617a09ef..fe8a1c1d79 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -28,6 +28,7 @@ external_toc_path = "./sphinx/_toc.yml" docs_core = ROCmDocs(left_nav_title) docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml") +docs_core.enable_api_reference() docs_core.setup() external_projects_current_project = "composable_kernel" diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index fac9e138e1..d6f38e0ca9 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -1,4 +1,4 @@ -# Doxyfile 1.8.10 +# Doxyfile 1.9.7 # This file describes the settings to 
be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -12,16 +12,26 @@ # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). +# +# Note: +# +# Use doxygen to compare the used configuration file with the template +# configuration file: +# doxygen -x [configFile] +# Use doxygen to compare the used configuration file with the template +# configuration file without replacing the environment variables or CMake type +# replacement variables: +# doxygen -x_noenv [configFile] #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv -# for the list of possible encodings. +# This tag specifies the encoding used for all characters in the configuration +# file that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 @@ -44,14 +54,14 @@ PROJECT_NUMBER = v3.0.1.0 # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. 
-PROJECT_BRIEF = "prototype interfaces compatible with ROCm platform and HiP" +PROJECT_BRIEF = "prototype interfaces compatible with ROCm platform and HIP" # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. -PROJECT_LOGO = +PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is @@ -60,16 +70,28 @@ PROJECT_LOGO = OUTPUT_DIRECTORY = . -# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. Enabling this +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create up to 4096 +# sub-directories (in 2 levels) under the output directory of each output format +# and will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise causes -# performance problems for the file system. +# performance problems for the file system. Adapt CREATE_SUBDIRS_LEVEL to +# control the number of sub-directories. # The default value is: NO. CREATE_SUBDIRS = NO +# Controls the number of sub-directories that will be created when +# CREATE_SUBDIRS tag is set to YES. Level 0 represents 16 directories, and every +# level increment doubles the number of directories, resulting in 4096 +# directories at level 8 which is the default and also the maximum value. The +# sub-directories are organized in 2 levels, the first level always has a fixed +# number of 16 directories. +# Minimum value: 0, maximum value: 8, default value: 8. 
+# This tag requires that the tag CREATE_SUBDIRS is set to YES. + +CREATE_SUBDIRS_LEVEL = 8 + # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode @@ -81,14 +103,14 @@ ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. -# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, -# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), -# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, -# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), -# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, -# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, -# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, -# Ukrainian and Vietnamese. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Bulgarian, +# Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, English +# (United States), Esperanto, Farsi (Persian), Finnish, French, German, Greek, +# Hindi, Hungarian, Indonesian, Italian, Japanese, Japanese-en (Japanese with +# English messages), Korean, Korean-en (Korean with English messages), Latvian, +# Lithuanian, Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, +# Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, +# Swedish, Turkish, Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English @@ -162,7 +184,8 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. 
-STRIP_FROM_PATH = +#STRIP_FROM_PATH = +STRIP_FROM_PATH = /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/latest/ # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -171,7 +194,8 @@ STRIP_FROM_PATH = # specify the list of include paths that are normally passed to the compiler # using the -I flag. -STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = + # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't @@ -189,6 +213,16 @@ SHORT_NAMES = NO JAVADOC_AUTOBRIEF = NO +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. +# The default value is: NO. + +JAVADOC_BANNER = NO + # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus @@ -209,6 +243,14 @@ QT_AUTOBRIEF = NO MULTILINE_CPP_IS_BRIEF = NO +# By default Python docstrings are displayed as preformatted text and doxygen's +# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the +# doxygen's special commands can be used and the contents of the docstring +# documentation blocks is shown as doxygen documentation. +# The default value is: YES. + +PYTHON_DOCSTRING = YES + # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. @@ -232,20 +274,19 @@ TAB_SIZE = 4 # the documentation. 
An alias has the form: # name=value # For example adding -# "sideeffect=@par Side Effects:\n" +# "sideeffect=@par Side Effects:^^" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading -# "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines. +# "Side Effects:". Note that you cannot put \n's in the value part of an alias +# to insert newlines (in the resulting output). You can put ^^ in the value part +# of an alias to insert a newline as if a physical newline was in the original +# file. When you need a literal { or } or , in the value part of an alias you +# have to escape them by means of a backslash (\), this can lead to conflicts +# with the commands \{ and \} for these it is advised to use the version @{ and +# @} or use a double escape (\\{ and \\}) ALIASES = -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". For example adding "class=itcl::class" -# will allow you to use the command class in the itcl::class meaning. - -TCL_SUBST = - # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all @@ -274,28 +315,40 @@ OPTIMIZE_FOR_FORTRAN = NO OPTIMIZE_OUTPUT_VHDL = NO +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. 
Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: -# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: -# Fortran. In the later case the parser tries to guess whether the code is fixed -# or free formatted code, this is the default for Fortran type files), VHDL. For -# instance to make doxygen treat .inc files as Fortran files (default is PHP), -# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, +# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice, +# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files). For instance to make doxygen treat .inc files +# as Fortran files (default is PHP), and .f files as C (default is Fortran), +# use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. +# the files are not read by doxygen. When specifying no_extension you should add +# * to the FILE_PATTERNS. +# +# Note see also the list of default file extension mappings. EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. +# documentation. See https://daringfireball.net/projects/markdown/ for details. 
# The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues. @@ -303,6 +356,26 @@ EXTENSION_MAPPING = MARKDOWN_SUPPORT = YES +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 5. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 5 + +# The MARKDOWN_ID_STYLE tag can be used to specify the algorithm used to +# generate identifiers for the Markdown headings. Note: Every identifier is +# unique. +# Possible values are: DOXYGEN Use a fixed 'autotoc_md' string followed by a +# sequence number starting at 0. and GITHUB Use the lower case version of title +# with any whitespace replaced by '-' and punctations characters removed.. +# The default value is: DOXYGEN. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +MARKDOWN_ID_STYLE = DOXYGEN + # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or @@ -328,7 +401,7 @@ BUILTIN_STL_SUPPORT = YES CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. 
@@ -414,6 +487,27 @@ TYPEDEF_HIDES_STRUCT = YES LOOKUP_CACHE_SIZE = 0 +# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use +# during processing. When set to 0 doxygen will based this on the number of +# cores available in the system. You can set it explicitly to a value larger +# than 0 to get more control over the balance between CPU load and processing +# speed. At this moment only the input processing can be done using multiple +# threads. Since this is still an experimental feature the default is set to 1, +# which effectively disables parallel processing. Please report any issues you +# encounter. Generating dot graphs in parallel is controlled by the +# DOT_NUM_THREADS setting. +# Minimum value: 0, maximum value: 32, default value: 1. + +NUM_PROC_THREADS = 1 + +# If the TIMESTAMP tag is set different from NO then each generated page will +# contain the date or date and time when the page was generated. Setting this to +# NO can help when comparing the output of multiple runs. +# Possible values are: YES, NO, DATETIME and DATE. +# The default value is: NO. + +TIMESTAMP = YES + #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- @@ -434,6 +528,12 @@ EXTRACT_ALL = YES EXTRACT_PRIVATE = NO +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. @@ -471,6 +571,13 @@ EXTRACT_LOCAL_METHODS = NO EXTRACT_ANON_NSPACES = NO +# If this flag is set to YES, the name of an unnamed parameter in a declaration +# will be determined by the corresponding definition. 
By default unnamed +# parameters remain unnamed in the output. +# The default value is: YES. + +RESOLVE_UNNAMED_PARAMS = YES + # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation @@ -482,14 +589,15 @@ HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option -# has no effect if EXTRACT_ALL is enabled. +# will also hide undocumented C++ concepts if enabled. This option has no effect +# if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -# (class|struct|union) declarations. If set to NO, these declarations will be -# included in the documentation. +# declarations. If set to NO, these declarations will be included in the +# documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO @@ -508,12 +616,20 @@ HIDE_IN_BODY_DOCS = NO INTERNAL_DOCS = NO -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES, upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. -# The default value is: system dependent. +# With the correct setting of option CASE_SENSE_NAMES doxygen will better be +# able to match the capabilities of the underlying filesystem. In case the +# filesystem is case sensitive (i.e. 
it supports files in the same directory +# whose names only differ in casing), the option must be set to YES to properly +# deal with such files in case they appear in the input. For filesystems that +# are not case sensitive the option should be set to NO to properly deal with +# output files written for symbols that only differ in casing, such as for two +# classes, one named CLASS and the other named Class, and to also support +# references to files without having to specify the exact matching casing. On +# Windows (including Cygwin) and MacOS, users should typically set this option +# to NO, whereas on Linux or other Unix flavors it should typically be set to +# YES. +# Possible values are: SYSTEM, NO and YES. +# The default value is: SYSTEM. CASE_SENSE_NAMES = NO @@ -531,6 +647,12 @@ HIDE_SCOPE_NAMES = NO HIDE_COMPOUND_REFERENCE= NO +# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class +# will show which file needs to be included to use the class. +# The default value is: YES. + +SHOW_HEADERFILE = YES + # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. @@ -688,7 +810,8 @@ FILE_VERSION_FILTER = # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml -# will be used as the name of the layout file. +# will be used as the name of the layout file. See also section "Changing the +# layout of pages" for information. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE @@ -699,7 +822,7 @@ LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. 
This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. @@ -734,34 +857,81 @@ WARNINGS = YES WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some parameters -# in a documented function, or documenting parameters that don't exist or using -# markup commands wrongly. +# potential errors in the documentation, such as documenting some parameters in +# a documented function twice, or documenting parameters that don't exist or +# using markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES +# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete +# function parameter documentation. If set to NO, doxygen will accept that some +# parameters have no documentation without warning. +# The default value is: YES. + +WARN_IF_INCOMPLETE_DOC = YES + # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return -# value. If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. +# value. If set to NO, doxygen will only warn about wrong parameter +# documentation, but not about the absence of documentation. If EXTRACT_ALL is +# set to YES then this flag will automatically be disabled. See also +# WARN_IF_INCOMPLETE_DOC # The default value is: NO. 
WARN_NO_PARAMDOC = NO +# If WARN_IF_UNDOC_ENUM_VAL option is set to YES, doxygen will warn about +# undocumented enumeration values. If set to NO, doxygen will accept +# undocumented enumeration values. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: NO. + +WARN_IF_UNDOC_ENUM_VAL = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS +# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but +# at the end of the doxygen process doxygen will return with a non-zero status. +# If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS_PRINT then doxygen behaves +# like FAIL_ON_WARNINGS but in case no WARN_LOGFILE is defined doxygen will not +# write the warning messages in between other messages but write them at the end +# of a run, in case a WARN_LOGFILE is defined the warning messages will be +# besides being in the defined file also be shown at the end of a run, unless +# the WARN_LOGFILE is defined as - i.e. standard output (stdout) in that case +# the behavior will remain as with the setting FAIL_ON_WARNINGS. +# Possible values are: NO, YES, FAIL_ON_WARNINGS and FAIL_ON_WARNINGS_PRINT. +# The default value is: NO. + +WARN_AS_ERROR = NO + # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) +# See also: WARN_LINE_FORMAT # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" +# In the $text part of the WARN_FORMAT command it is possible that a reference +# to a more specific place is given. 
To make it easier to jump to this place +# (outside of doxygen) the user can define a custom "cut" / "paste" string. +# Example: +# WARN_LINE_FORMAT = "'vi $file +$line'" +# See also: WARN_FORMAT +# The default value is: at line $line of file $file. + +WARN_LINE_FORMAT = "at line $line of file $file" + # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard -# error (stderr). +# error (stderr). In case the file specified cannot be opened for writing the +# warning and error messages are written to standard error. When as file - is +# specified the warning and error messages are written to standard output +# (stdout). WARN_LOGFILE = @@ -785,12 +955,23 @@ INPUT = ../../include/ck/tensor_operation/gpu/grid \ # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: http://www.gnu.org/software/libiconv) for the list of -# possible encodings. +# documentation (see: +# https://www.gnu.org/software/libiconv/) for the list of possible encodings. +# See also: INPUT_FILE_ENCODING # The default value is: UTF-8. INPUT_ENCODING = UTF-8 +# This tag can be used to specify the character encoding of the source files +# that doxygen parses The INPUT_FILE_ENCODING tag can be used to specify +# character encoding on a per file pattern basis. Doxygen will compare the file +# name with each pattern and apply the encoding instead of the default +# INPUT_ENCODING) if there is a match. The character encodings are a list of the +# form: pattern=encoding (like *.php=ISO-8859-1). See cfg_input_encoding +# "INPUT_ENCODING" for further information on supported encodings. 
+ +INPUT_FILE_ENCODING = + # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. @@ -799,11 +980,15 @@ INPUT_ENCODING = UTF-8 # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # +# Note the list of default checked file patterns might differ from the list of +# default file extension mappings. +# # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, -# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, -# *.vhdl, *.ucf, *.qsf, *.as and *.js. +# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, +# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C +# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, +# *.vhdl, *.ucf, *.qsf and *.ice. FILE_PATTERNS = *.c \ *.cc \ @@ -824,6 +1009,7 @@ FILE_PATTERNS = *.c \ *.hxx \ *.hpp \ *.h++ \ + *.l \ *.cs \ *.d \ *.php \ @@ -837,13 +1023,19 @@ FILE_PATTERNS = *.c \ *.mm \ *.dox \ *.py \ - *.tcl \ + *.pyw \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f18 \ + *.f \ + *.for \ *.vhd \ *.vhdl \ *.ucf \ *.qsf \ - *.as \ - *.js + *.ice # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. @@ -880,10 +1072,7 @@ EXCLUDE_PATTERNS = # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. 
Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories use the pattern */test/* +# ANamespace::AClass, ANamespace::*Test EXCLUDE_SYMBOLS = @@ -927,6 +1116,15 @@ IMAGE_PATH = # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. +# +# Note that doxygen will use the data processed and written to standard output +# for further processing, therefore nothing else, like debug statements or used +# commands (so in case of a Windows batch file always use @echo OFF), should be +# written to standard output. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. INPUT_FILTER = @@ -936,6 +1134,10 @@ INPUT_FILTER = # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. FILTER_PATTERNS = @@ -959,7 +1161,16 @@ FILTER_SOURCE_PATTERNS = # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. -USE_MDFILE_AS_MAINPAGE = ../README.md +USE_MDFILE_AS_MAINPAGE = ../../README.md + +# The Fortran standard specifies that for fixed formatted Fortran code all +# characters from position 72 are to be considered as comment. A common +# extension is to allow longer lines before the automatic comment starts. 
The +# setting FORTRAN_COMMENT_AFTER will also make it possible that longer lines can +# be processed before the automatic comment starts. +# Minimum value: 7, maximum value: 10000, default value: 72. + +FORTRAN_COMMENT_AFTER = 72 #--------------------------------------------------------------------------- # Configuration options related to source browsing @@ -988,7 +1199,7 @@ INLINE_SOURCES = NO STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. +# entity all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO @@ -1020,12 +1231,12 @@ SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system -# (see http://www.gnu.org/software/global/global.html). You will need version +# (see https://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # @@ -1047,25 +1258,6 @@ USE_HTAGS = NO VERBATIM_HEADERS = YES -# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the -# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the -# cost of reduced performance. This can be particularly helpful with template -# rich C++ code for which doxygen's built-in parser lacks the necessary type -# information. -# Note: The availability of this option depends on whether or not doxygen was -# compiled with the --with-libclang option. -# The default value is: NO. 
- -CLANG_ASSISTED_PARSING = NO - -# If clang assisted parsing is enabled you can provide the compiler with command -# line options that you would normally use when invoking the compiler. Note that -# the include paths will already be set by doxygen for the files and directories -# specified with INPUT and INCLUDE_PATH. -# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. - -CLANG_OPTIONS = - #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- @@ -1077,17 +1269,11 @@ CLANG_OPTIONS = ALPHABETICAL_INDEX = YES -# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in -# which the alphabetical index list will be split. -# Minimum value: 1, maximum value: 20, default value: 5. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -COLS_IN_ALPHA_INDEX = 5 - -# In case all classes in a project start with a common prefix, all classes will -# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -# can be used to specify a prefix (or a list of prefixes) that should be ignored -# while generating the index headers. +# The IGNORE_PREFIX tag can be used to specify a prefix (or a list of prefixes) +# that should be ignored while generating the index headers. The IGNORE_PREFIX +# tag works for classes, function and member names. The entity will be placed in +# the alphabetical list under the first letter of the entity name that remains +# after removing the prefix. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = @@ -1134,7 +1320,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. 
-HTML_HEADER = +HTML_HEADER = ../_doxygen/header.html # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard @@ -1144,7 +1330,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = ../_doxygen/footer.html # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1156,7 +1342,7 @@ HTML_FOOTER = # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_STYLESHEET = +HTML_STYLESHEET = ../_doxygen/stylesheet.css # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets @@ -1166,10 +1352,15 @@ HTML_STYLESHEET = # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the -# list). For an example see the documentation. +# list). +# Note: Since the styling of scrollbars can currently not be overruled in +# Webkit/Chromium, the styling will be left out of the default doxygen.css if +# one or more extra stylesheets have been specified. So if scrollbar +# customization is desired it has to be added explicitly. For an example see the +# documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = ../_doxygen/extra_stylesheet.css # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. 
Note @@ -1181,19 +1372,32 @@ HTML_EXTRA_STYLESHEET = HTML_EXTRA_FILES = +# The HTML_COLORSTYLE tag can be used to specify if the generated HTML output +# should be rendered with a dark or light theme. +# Possible values are: LIGHT always generate light mode output, DARK always +# generate dark mode output, AUTO_LIGHT automatically set the mode according to +# the user preference, use light mode if no preference is set (the default), +# AUTO_DARK automatically set the mode according to the user preference, use +# dark mode if no preference is set and TOGGLE allow the user to switch between +# light and dark mode via a button. +# The default value is: AUTO_LIGHT. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE = LIGHT + # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to -# this color. Hue is specified as an angle on a colorwheel, see -# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# this color. Hue is specified as an angle on a color-wheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_COLORSTYLE_HUE = 220 +HTML_COLORSTYLE_HUE = 240 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors -# in the HTML output. For a value of 0 the output will use grayscales only. A +# in the HTML output. For a value of 0 the output will use gray-scales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. 
@@ -1211,14 +1415,16 @@ HTML_COLORSTYLE_SAT = 100 HTML_COLORSTYLE_GAMMA = 80 -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via JavaScript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have JavaScript, +# like the Qt help browser. +# The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_TIMESTAMP = NO +HTML_DYNAMIC_MENUS = YES # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the @@ -1243,13 +1449,14 @@ HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: http://developer.apple.com/tools/xcode/), introduced with -# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a -# Makefile in the HTML output directory. Running make will produce the docset in -# that directory and running make install will install the docset in +# environment (see: +# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To +# create a documentation set, doxygen will generate a Makefile in the HTML +# output directory. Running make will produce the docset in that directory and +# running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. 
See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy +# genXcode/_index.html for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1263,6 +1470,13 @@ GENERATE_DOCSET = NO DOCSET_FEEDNAME = "Doxygen generated docs" +# This tag determines the URL of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDURL = + # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. @@ -1288,8 +1502,12 @@ DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. +# on Windows. In the beginning of 2021 Microsoft took the original page, with +# a.o. the download links, offline the HTML help workshop was already many years +# in maintenance mode). You can download the HTML help workshop from the web +# archives at Installation executable (see: +# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo +# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe). # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). 
Compiled HTML @@ -1319,7 +1537,7 @@ CHM_FILE = HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated -# (YES) or that it should be included in the master .chm file (NO). +# (YES) or that it should be included in the main .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. @@ -1346,6 +1564,16 @@ BINARY_TOC = NO TOC_EXPAND = NO +# The SITEMAP_URL tag is used to specify the full URL of the place where the +# generated documentation will be placed on the server by the user during the +# deployment of the documentation. The generated sitemap is called sitemap.xml +# and placed on the directory specified by HTML_OUTPUT. In case no SITEMAP_URL +# is specified no sitemap is generated. For information about the sitemap +# protocol see https://www.sitemaps.org +# This tag requires that the tag GENERATE_HTML is set to YES. + +SITEMAP_URL = + # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help @@ -1364,7 +1592,8 @@ QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace -# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1372,8 +1601,8 @@ QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- -# folders). 
+# Folders (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1381,30 +1610,30 @@ QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: -# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. +# The QHG_LOCATION tag can be used to specify the location (absolute path +# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to +# run qhelpgenerator on the generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. 
QHG_LOCATION = @@ -1447,16 +1676,28 @@ DISABLE_INDEX = NO # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can -# further fine-tune the look of the index. As an example, the default style -# sheet generated by doxygen has an example that shows how to put an image at -# the root of the tree instead of the PROJECT_NAME. Since the tree basically has -# the same information as the tab index, you could consider setting -# DISABLE_INDEX to YES when enabling this option. +# further fine tune the look of the index (see "Fine-tuning the output"). As an +# example, the default style sheet generated by doxygen has an example that +# shows how to put an image at the root of the tree instead of the PROJECT_NAME. +# Since the tree basically has the same information as the tab index, you could +# consider setting DISABLE_INDEX to YES when enabling this option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = NO +# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the +# FULL_SIDEBAR option determines if the side bar is limited to only the treeview +# area (value NO) or if it should extend to the full height of the window (value +# YES). Setting this to YES gives a layout similar to +# https://docs.readthedocs.io with more room for contents, but less room for the +# project logo, title, and description. If either GENERATE_TREEVIEW or +# DISABLE_INDEX is set to NO, this option has no effect. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FULL_SIDEBAR = NO + # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. 
# @@ -1481,6 +1722,24 @@ TREEVIEW_WIDTH = 250 EXT_LINKS_IN_WINDOW = NO +# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email +# addresses. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +OBFUSCATE_EMAILS = YES + +# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg +# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see +# https://inkscape.org) to generate formulas as SVG images instead of PNGs for +# the HTML output. These images will generally look nicer at scaled resolutions. +# Possible values are: png (the default) and svg (looks nicer but requires the +# pdf2svg or inkscape tool). +# The default value is: png. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FORMULA_FORMAT = png + # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML @@ -1490,19 +1749,14 @@ EXT_LINKS_IN_WINDOW = NO FORMULA_FONTSIZE = 10 -# Use the FORMULA_TRANPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are not -# supported properly for IE 6.0, but are supported on all modern browsers. -# -# Note that when changing this option you need to delete any form_*.png files in -# the HTML output directory before the changes have effect. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES. +# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands +# to create new LaTeX commands to be used in formulas as building blocks. See +# the section "Including formulas" for details. 
-FORMULA_TRANSPARENT = YES +FORMULA_MACROFILE = # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -# http://www.mathjax.org) which uses client side Javascript for the rendering +# https://www.mathjax.org) which uses client side JavaScript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want to formulas look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path @@ -1512,11 +1766,29 @@ FORMULA_TRANSPARENT = YES USE_MATHJAX = YES +# With MATHJAX_VERSION it is possible to specify the MathJax version to be used. +# Note that the different versions of MathJax have different requirements with +# regards to the different settings, so it is possible that also other MathJax +# settings have to be changed when switching between the different MathJax +# versions. +# Possible values are: MathJax_2 and MathJax_3. +# The default value is: MathJax_2. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_VERSION = MathJax_2 + # When MathJax is enabled you can set the default output format to be used for -# the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/latest/output.html) for more details. +# the MathJax output. For more details about the output format see MathJax +# version 2 (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3 +# (see: +# http://docs.mathjax.org/en/latest/web/components/output.html). # Possible values are: HTML-CSS (which is slower, but has the best -# compatibility), NativeMML (i.e. MathML) and SVG. +# compatibility. This is the name for Mathjax version 2, for MathJax version 3 +# this will be translated into chtml), NativeMML (i.e. MathML. Only supported +# for MathJax 2. 
For MathJax version 3 chtml will be used instead.), chtml (This +# is the name for Mathjax version 3, for MathJax version 2 this will be +# translated into HTML-CSS) and SVG. # The default value is: HTML-CSS. # This tag requires that the tag USE_MATHJAX is set to YES. @@ -1529,22 +1801,29 @@ MATHJAX_FORMAT = HTML-CSS # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of -# MathJax from http://www.mathjax.org before deployment. -# The default value is: http://cdn.mathjax.org/mathjax/latest. +# MathJax from https://www.mathjax.org before deployment. The default value is: +# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2 +# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3 # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest +MATHJAX_RELPATH = # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example +# for MathJax version 2 (see +# https://docs.mathjax.org/en/v2.7-latest/tex.html#tex-and-latex-extensions): # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# For example for MathJax version 3 (see +# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html): +# MATHJAX_EXTENSIONS = ams # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site -# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. 
@@ -1572,7 +1851,7 @@ MATHJAX_CODEFILE = SEARCHENGINE = YES # When the SERVER_BASED_SEARCH tag is enabled the search engine will be -# implemented using a web server instead of a web client using Javascript. There +# implemented using a web server instead of a web client using JavaScript. There # are two flavors of web server based searching depending on the EXTERNAL_SEARCH # setting. When disabled, doxygen will generate a PHP script for searching and # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing @@ -1591,7 +1870,8 @@ SERVER_BASED_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). +# Xapian (see: +# https://xapian.org/). # # See the section "External Indexing and Searching" for details. # The default value is: NO. @@ -1604,8 +1884,9 @@ EXTERNAL_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). See the section "External Indexing and -# Searching" for details. +# Xapian (see: +# https://xapian.org/). See the section "External Indexing and Searching" for +# details. # This tag requires that the tag SEARCHENGINE is set to YES. SEARCHENGINE_URL = @@ -1656,21 +1937,35 @@ LATEX_OUTPUT = latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. # -# Note that when enabling USE_PDFLATEX this option is only used for generating -# bitmaps for formulas in the HTML output, but not in the Makefile that is -# written to the output directory. -# The default file is: latex. +# Note that when not enabling USE_PDFLATEX the default is latex when enabling +# USE_PDFLATEX the default is pdflatex and when in the later case latex is +# chosen this is overwritten by pdflatex. 
For specific output languages the +# default can have been set differently, this depends on the implementation of +# the output language. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_CMD_NAME = latex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate # index for LaTeX. +# Note: This tag is used in the Makefile / make.bat. +# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file +# (.tex). # The default file is: makeindex. # This tag requires that the tag GENERATE_LATEX is set to YES. MAKEINDEX_CMD_NAME = makeindex +# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to +# generate index for LaTeX. In case there is no backslash (\) as first character +# it will be automatically added in the LaTeX code. +# Note: This tag is used in the generated output file (.tex). +# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat. +# The default value is: makeindex. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_MAKEINDEX_CMD = makeindex + # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX # documents. This may be useful for small projects and may help to save some # trees in general. @@ -1700,29 +1995,31 @@ PAPER_TYPE = a4 EXTRA_PACKAGES = -# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the -# generated LaTeX document. The header should contain everything until the first -# chapter. If it is left blank doxygen will generate a standard header. See -# section "Doxygen usage" for information on how to let doxygen write the -# default header to a separate file. +# The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for +# the generated LaTeX document. The header should contain everything until the +# first chapter. If it is left blank doxygen will generate a standard header. 
It +# is highly recommended to start with a default header using +# doxygen -w latex new_header.tex new_footer.tex new_stylesheet.sty +# and then modify the file new_header.tex. See also section "Doxygen usage" for +# information on how to generate the default header that doxygen normally uses. # -# Note: Only use a user-defined header if you know what you are doing! The -# following commands have a special meaning inside the header: $title, -# $datetime, $date, $doxygenversion, $projectname, $projectnumber, -# $projectbrief, $projectlogo. Doxygen will replace $title with the empty -# string, for the replacement values of the other commands the user is referred -# to HTML_HEADER. +# Note: Only use a user-defined header if you know what you are doing! +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. The following +# commands have a special meaning inside the header (and footer): For a +# description of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_HEADER = -# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the -# generated LaTeX document. The footer should contain everything after the last -# chapter. If it is left blank doxygen will generate a standard footer. See +# The LATEX_FOOTER tag can be used to specify a user-defined LaTeX footer for +# the generated LaTeX document. The footer should contain everything after the +# last chapter. If it is left blank doxygen will generate a standard footer. See # LATEX_HEADER for more information on how to generate a default footer and what -# special commands can be used inside the footer. -# -# Note: Only use a user-defined footer if you know what you are doing! +# special commands can be used inside the footer. 
See also section "Doxygen +# usage" for information on how to generate the default footer that doxygen +# normally uses. Note: Only use a user-defined footer if you know what you are +# doing! # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_FOOTER = @@ -1755,18 +2052,26 @@ LATEX_EXTRA_FILES = PDF_HYPERLINKS = YES -# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate -# the PDF file directly from the LaTeX files. Set this option to YES, to get a -# higher quality PDF documentation. +# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as +# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX +# files. Set this option to YES, to get a higher quality PDF documentation. +# +# See also section LATEX_CMD_NAME for selecting the engine. # The default value is: YES. # This tag requires that the tag GENERATE_LATEX is set to YES. USE_PDFLATEX = YES -# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode -# command to the generated LaTeX files. This will instruct LaTeX to keep running -# if errors occur, instead of asking the user for help. This option is also used -# when generating formulas in HTML. +# The LATEX_BATCHMODE tag signals the behavior of LaTeX in case of an error. +# Possible values are: NO same as ERROR_STOP, YES same as BATCH, BATCH In batch +# mode nothing is printed on the terminal, errors are scrolled as if <return> is +# hit at every error; missing files that TeX tries to input or request from +# keyboard input (\read on a not open input stream) cause the job to abort, +# NON_STOP In nonstop mode the diagnostic message will appear on the terminal, +# but there is no possibility of user interaction just like in batch mode, +# SCROLL In scroll mode, TeX will stop only for missing files to input or if +# keyboard input is necessary and ERROR_STOP In errorstop mode, TeX will stop at +# each error, asking for user intervention. # The default value is: NO. 
# This tag requires that the tag GENERATE_LATEX is set to YES. @@ -1779,24 +2084,22 @@ LATEX_BATCHMODE = NO LATEX_HIDE_INDICES = NO -# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source -# code with syntax highlighting in the LaTeX output. -# -# Note that which sources are shown also depends on other settings such as -# SOURCE_BROWSER. -# The default value is: NO. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_SOURCE_CODE = NO - # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. See -# http://en.wikipedia.org/wiki/BibTeX and \cite for more info. +# https://en.wikipedia.org/wiki/BibTeX and \cite for more info. # The default value is: plain. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_BIB_STYLE = plain +# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) +# path from which the emoji images will be read. If a relative path is entered, +# it will be relative to the LATEX_OUTPUT directory. If left blank the +# LATEX_OUTPUT directory will be used. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_EMOJI_DIRECTORY = + #--------------------------------------------------------------------------- # Configuration options related to the RTF output #--------------------------------------------------------------------------- @@ -1836,9 +2139,9 @@ COMPACT_RTF = NO RTF_HYPERLINKS = NO -# Load stylesheet definitions from file. Syntax is similar to doxygen's config -# file, i.e. a series of assignments. You only have to provide replacements, -# missing definitions are set to their default value. +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# configuration file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. 
# # See also section "Doxygen usage" for information on how to generate the # default style sheet that doxygen normally uses. @@ -1847,22 +2150,12 @@ RTF_HYPERLINKS = NO RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is -# similar to doxygen's config file. A template extensions file can be generated -# using doxygen -e rtf extensionFile. +# similar to doxygen's configuration file. A template extensions file can be +# generated using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. RTF_EXTENSIONS_FILE = -# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code -# with syntax highlighting in the RTF output. -# -# Note that which sources are shown also depends on other settings such as -# SOURCE_BROWSER. -# The default value is: NO. -# This tag requires that the tag GENERATE_RTF is set to YES. - -RTF_SOURCE_CODE = NO - #--------------------------------------------------------------------------- # Configuration options related to the man page output #--------------------------------------------------------------------------- @@ -1934,6 +2227,13 @@ XML_OUTPUT = xml XML_PROGRAMLISTING = YES +# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include +# namespace members in file scope as well, matching the HTML output. +# The default value is: NO. +# This tag requires that the tag GENERATE_XML is set to YES. + +XML_NS_MEMB_FILE_SCOPE = NO + #--------------------------------------------------------------------------- # Configuration options related to the DOCBOOK output #--------------------------------------------------------------------------- @@ -1952,23 +2252,14 @@ GENERATE_DOCBOOK = NO DOCBOOK_OUTPUT = docbook -# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the -# program listings (including syntax highlighting and cross-referencing -# information) to the DOCBOOK output. 
Note that enabling this will significantly -# increase the size of the DOCBOOK output. -# The default value is: NO. -# This tag requires that the tag GENERATE_DOCBOOK is set to YES. - -DOCBOOK_PROGRAMLISTING = NO - #--------------------------------------------------------------------------- # Configuration options for the AutoGen Definitions output #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an -# AutoGen Definitions (see http://autogen.sf.net) file that captures the -# structure of the code including all documentation. Note that this feature is -# still experimental and incomplete at the moment. +# AutoGen Definitions (see https://autogen.sourceforge.net/) file that captures +# the structure of the code including all documentation. Note that this feature +# is still experimental and incomplete at the moment. # The default value is: NO. GENERATE_AUTOGEN_DEF = NO @@ -2047,7 +2338,8 @@ SEARCH_INCLUDES = NO # The INCLUDE_PATH tag can be used to specify one or more directories that # contain include files that are not input files but should be processed by the -# preprocessor. +# preprocessor. Note that the INCLUDE_PATH is not recursive, so the setting of +# RECURSIVE has no effect here. # This tag requires that the tag SEARCH_INCLUDES is set to YES. INCLUDE_PATH = @@ -2136,41 +2428,10 @@ EXTERNAL_GROUPS = YES EXTERNAL_PAGES = YES -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of 'which perl'). -# The default file (with absolute path) is: /usr/bin/perl. 
- -PERL_PATH = /usr/bin/perl - #--------------------------------------------------------------------------- -# Configuration options related to the dot tool +# Configuration options related to diagram generator tools #--------------------------------------------------------------------------- -# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram -# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to -# NO turns the diagrams off. Note that this option also works with HAVE_DOT -# disabled, but it is recommended to install and use dot, since it yields more -# powerful graphs. -# The default value is: YES. - -CLASS_DIAGRAMS = NO - -# You can define message sequence charts within doxygen comments using the \msc -# command. Doxygen will then run the mscgen tool (see: -# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the -# documentation. The MSCGEN_PATH tag allows you to specify the directory where -# the mscgen tool resides. If left empty the tool is assumed to be found in the -# default search path. - -MSCGEN_PATH = - -# You can include diagrams made with dia in doxygen documentation. Doxygen will -# then run dia to produce the diagram and insert it in the documentation. The -# DIA_PATH tag allows you to specify the directory where the dia binary resides. -# If left empty dia is assumed to be found in the default search path. - -DIA_PATH = - # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. # The default value is: YES. @@ -2179,7 +2440,7 @@ HIDE_UNDOC_RELATIONS = YES # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is # available from the path. This tool is part of Graphviz (see: -# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent +# https://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent # Bell Labs. 
The other options in this section have no effect if this option is # set to NO # The default value is: NO. @@ -2196,35 +2457,52 @@ HAVE_DOT = NO DOT_NUM_THREADS = 0 -# When you want a differently looking font in the dot files that doxygen -# generates you can specify the font name using DOT_FONTNAME. You need to make -# sure dot is able to find the font, which can be done by putting it in a -# standard location or by setting the DOTFONTPATH environment variable or by -# setting DOT_FONTPATH to the directory containing the font. -# The default value is: Helvetica. +# DOT_COMMON_ATTR is common attributes for nodes, edges and labels of +# subgraphs. When you want a differently looking font in the dot files that +# doxygen generates you can specify fontname, fontcolor and fontsize attributes. +# For details please see Node, +# Edge and Graph Attributes specification You need to make sure dot is able +# to find the font, which can be done by putting it in a standard location or by +# setting the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the +# directory containing the font. Default graphviz fontsize is 14. +# The default value is: fontname=Helvetica,fontsize=10. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTNAME = Helvetica +DOT_COMMON_ATTR = "fontname=Helvetica,fontsize=10" -# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of -# dot graphs. -# Minimum value: 4, maximum value: 24, default value: 10. +# DOT_EDGE_ATTR is concatenated with DOT_COMMON_ATTR. For elegant style you can +# add 'arrowhead=open, arrowtail=open, arrowsize=0.5'. Complete documentation about +# arrows shapes. +# The default value is: labelfontname=Helvetica,labelfontsize=10. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTSIZE = 10 +DOT_EDGE_ATTR = "labelfontname=Helvetica,labelfontsize=10" -# By default doxygen will tell dot to use the default font as specified with -# DOT_FONTNAME. 
If you specify a different font using DOT_FONTNAME you can set -# the path where dot can find it using this tag. +# DOT_NODE_ATTR is concatenated with DOT_COMMON_ATTR. For view without boxes +# around nodes set 'shape=plain' or 'shape=plaintext' Shapes specification +# The default value is: shape=box,height=0.2,width=0.4. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_NODE_ATTR = "shape=box,height=0.2,width=0.4" + +# You can set the path where dot can find font specified with fontname in +# DOT_COMMON_ATTR and others dot attributes. # This tag requires that the tag HAVE_DOT is set to YES. DOT_FONTPATH = -# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for -# each documented class showing the direct and indirect inheritance relations. -# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO. +# If the CLASS_GRAPH tag is set to YES or GRAPH or BUILTIN then doxygen will +# generate a graph for each documented class showing the direct and indirect +# inheritance relations. In case the CLASS_GRAPH tag is set to YES or GRAPH and +# HAVE_DOT is enabled as well, then dot will be used to draw the graph. In case +# the CLASS_GRAPH tag is set to YES and HAVE_DOT is disabled or if the +# CLASS_GRAPH tag is set to BUILTIN, then the built-in generator will be used. +# If the CLASS_GRAPH tag is set to TEXT the direct and indirect inheritance +# relations will be shown as texts / links. +# Possible values are: NO, YES, TEXT, GRAPH and BUILTIN. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. CLASS_GRAPH = YES @@ -2238,7 +2516,8 @@ CLASS_GRAPH = YES COLLABORATION_GRAPH = YES # If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for -# groups, showing the direct groups dependencies. +# groups, showing the direct groups dependencies. See also the chapter Grouping +# in the manual. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. 
@@ -2261,10 +2540,32 @@ UML_LOOK = NO # but if the number exceeds 15, the total amount of fields shown is limited to # 10. # Minimum value: 0, maximum value: 100, default value: 10. -# This tag requires that the tag HAVE_DOT is set to YES. +# This tag requires that the tag UML_LOOK is set to YES. UML_LIMIT_NUM_FIELDS = 10 +# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and +# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS +# tag is set to YES, doxygen will add type and arguments for attributes and +# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen +# will not generate fields with class member information in the UML graphs. The +# class diagrams will look similar to the default class diagrams but using UML +# notation for the relationships. +# Possible values are: NO, YES and NONE. +# The default value is: NO. +# This tag requires that the tag UML_LOOK is set to YES. + +DOT_UML_DETAILS = NO + +# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters +# to display on a single line. If the actual line length exceeds this threshold +# significantly it will wrapped across multiple lines. Some heuristics are apply +# to avoid ugly line breaks. +# Minimum value: 0, maximum value: 1000, default value: 17. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_WRAP_THRESHOLD = 17 + # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and # collaboration graphs will show the relations between templates and their # instances. @@ -2331,10 +2632,17 @@ GRAPHICAL_HIERARCHY = YES DIRECTORY_GRAPH = YES +# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels +# of child directories generated in directory dependency graphs by dot. +# Minimum value: 1, maximum value: 25, default value: 1. +# This tag requires that the tag DIRECTORY_GRAPH is set to YES. 
+ +DIR_GRAPH_MAX_DEPTH = 1 + # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. For an explanation of the image formats see the section # output formats in the documentation of the dot tool (Graphviz (see: -# http://www.graphviz.org/)). +# https://www.graphviz.org/)). # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order # to make the SVG files visible in IE 9+ (other browsers do not have this # requirement). @@ -2371,11 +2679,12 @@ DOT_PATH = DOTFILE_DIRS = -# The MSCFILE_DIRS tag can be used to specify one or more directories that -# contain msc files that are included in the documentation (see the \mscfile -# command). +# You can include diagrams made with dia in doxygen documentation. Doxygen will +# then run dia to produce the diagram and insert it in the documentation. The +# DIA_PATH tag allows you to specify the directory where the dia binary resides. +# If left empty dia is assumed to be found in the default search path. -MSCFILE_DIRS = +DIA_PATH = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile @@ -2384,13 +2693,18 @@ MSCFILE_DIRS = DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the -# path where java can find the plantuml.jar file. If left blank, it is assumed -# PlantUML is not used or called during a preprocessing step. Doxygen will -# generate a warning when it encounters a \startuml command in this case and -# will not generate output for the diagram. +# path where java can find the plantuml.jar file or to the filename of jar file +# to be used. If left blank, it is assumed PlantUML is not used or called during +# a preprocessing step. Doxygen will generate a warning when it encounters a +# \startuml command in this case and will not generate output for the diagram. 
PLANTUML_JAR_PATH = +# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a +# configuration file for plantuml. + +PLANTUML_CFG_FILE = + # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. @@ -2420,18 +2734,6 @@ DOT_GRAPH_MAX_NODES = 50 MAX_DOT_GRAPH_DEPTH = 0 -# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent -# background. This is disabled by default, because dot on Windows does not seem -# to support this out of the box. -# -# Warning: Depending on the platform used, enabling this option may lead to -# badly anti-aliased labels on the edges of a graph (i.e. they become hard to -# read). -# The default value is: NO. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_TRANSPARENT = NO - # Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). This # makes dot run faster, but since only newer versions of dot (>1.8.10) support @@ -2444,14 +2746,34 @@ DOT_MULTI_TARGETS = NO # If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page # explaining the meaning of the various boxes and arrows in the dot generated # graphs. +# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal +# graphical representation for inheritance and collaboration diagrams is used. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. GENERATE_LEGEND = YES -# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot +# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate # files that are used to generate the various graphs. +# +# Note: This setting is not only used for dot files but also for msc temporary +# files. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. 
DOT_CLEANUP = YES + +# You can define message sequence charts within doxygen comments using the \msc +# command. If the MSCGEN_TOOL tag is left empty (the default), then doxygen will +# use a built-in version of mscgen tool to produce the charts. Alternatively, +# the MSCGEN_TOOL tag can also specify the name an external tool. For instance, +# specifying prog as the value, doxygen will call the tool as prog -T +# -o . The external tool should support +# output file formats "png", "eps", "svg", and "ismap". + +MSCGEN_TOOL = + +# The MSCFILE_DIRS tag can be used to specify one or more directories that +# contain msc files that are included in the documentation (see the \mscfile +# command). + +MSCFILE_DIRS = diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index b89cb9fec8..ac03e40939 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.18.2 +rocm-docs-core[api_reference]==1.18.2 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 2a52a48e4c..3742eeebba 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -6,68 +6,79 @@ # accessible-pygments==0.0.5 # via pydata-sphinx-theme -alabaster==0.7.16 +alabaster==1.0.0 # via sphinx asttokens==3.0.0 # via stack-data -attrs==24.3.0 +attrs==25.3.0 # via # jsonschema # jupyter-cache # referencing -babel==2.15.0 +babel==2.17.0 # via # pydata-sphinx-theme # sphinx -beautifulsoup4==4.12.3 +beautifulsoup4==4.13.4 # via pydata-sphinx-theme -breathe==4.35.0 +breathe==4.36.0 # via rocm-docs-core -certifi==2024.7.4 +certifi==2025.1.31 # via requests -cffi==1.16.0 +cffi==1.17.1 # via # cryptography # pynacl -charset-normalizer==3.3.2 +charset-normalizer==3.4.1 # via requests -click==8.1.7 +click==8.1.8 # via + # click-log + # doxysphinx # jupyter-cache # sphinx-external-toc +click-log==0.4.0 + # via doxysphinx comm==0.2.2 # via ipykernel -cryptography==43.0.0 +contourpy==1.3.2 + # 
via matplotlib +cryptography==44.0.2 # via pyjwt -debugpy==1.8.12 +cycler==0.12.1 + # via matplotlib +debugpy==1.8.14 # via ipykernel -decorator==5.1.1 +decorator==5.2.1 # via ipython -deprecated==1.2.14 +deprecated==1.2.18 # via pygithub docutils==0.21.2 # via - # breathe # myst-parser # pybtex-docutils # pydata-sphinx-theme # sphinx # sphinxcontrib-bibtex +doxysphinx==3.3.12 + # via rocm-docs-core exceptiongroup==1.2.2 # via ipython -executing==2.1.0 +executing==2.2.0 # via stack-data -fastjsonschema==2.20.0 +fastjsonschema==2.21.1 # via # nbformat # rocm-docs-core -gitdb==4.0.11 +fonttools==4.57.0 + # via matplotlib +gitdb==4.0.12 # via gitpython -gitpython==3.1.43 +gitpython==3.1.44 # via rocm-docs-core -greenlet==3.1.1 +greenlet==3.2.1 # via sqlalchemy -idna==3.7 +idna==3.10 # via requests imagesize==1.4.1 # via sphinx @@ -77,13 +88,13 @@ importlib-metadata==8.6.1 # myst-nb ipykernel==6.29.5 # via myst-nb -ipython==8.31.0 +ipython==8.35.0 # via # ipykernel # myst-nb jedi==0.19.2 # via ipython -jinja2==3.1.4 +jinja2==3.1.6 # via # myst-parser # sphinx @@ -103,25 +114,35 @@ jupyter-core==5.7.2 # jupyter-client # nbclient # nbformat +kiwisolver==1.4.8 + # via matplotlib latexcodec==3.0.0 # via pybtex +libsass==0.22.0 + # via doxysphinx +lxml==5.2.1 + # via doxysphinx markdown-it-py==3.0.0 # via # mdit-py-plugins # myst-parser -markupsafe==2.1.5 +markupsafe==3.0.2 # via jinja2 +matplotlib==3.10.1 + # via doxysphinx matplotlib-inline==0.1.7 # via # ipykernel # ipython -mdit-py-plugins==0.4.1 +mdit-py-plugins==0.4.2 # via myst-parser mdurl==0.1.2 # via markdown-it-py -myst-nb==1.1.2 +mpire==2.10.2 + # via doxysphinx +myst-nb==1.2.0 # via rocm-docs-core -myst-parser==3.0.1 +myst-parser==4.0.1 # via myst-nb nbclient==0.10.2 # via @@ -134,20 +155,28 @@ nbformat==5.10.4 # nbclient nest-asyncio==1.6.0 # via ipykernel -packaging==24.1 +numpy==1.26.4 + # via + # contourpy + # doxysphinx + # matplotlib +packaging==25.0 # via # ipykernel + # matplotlib # pydata-sphinx-theme 
# sphinx parso==0.8.4 # via jedi pexpect==4.9.0 # via ipython -platformdirs==4.3.6 +pillow==11.2.1 + # via matplotlib +platformdirs==4.3.7 # via jupyter-core -prompt-toolkit==3.0.50 +prompt-toolkit==3.0.51 # via ipython -psutil==6.1.1 +psutil==7.0.0 # via ipykernel ptyprocess==0.7.0 # via pexpect @@ -165,21 +194,30 @@ pydata-sphinx-theme==0.15.4 # via # rocm-docs-core # sphinx-book-theme -pygithub==2.3.0 +pygithub==2.6.1 # via rocm-docs-core -pygments==2.18.0 +pygments==2.19.1 # via # accessible-pygments # ipython + # mpire # pydata-sphinx-theme # sphinx -pyjwt[crypto]==2.8.0 +pyjson5==1.6.8 + # via doxysphinx +pyjwt[crypto]==2.10.1 # via pygithub pynacl==1.5.0 # via pygithub +pyparsing==3.2.3 + # via + # doxysphinx + # matplotlib python-dateutil==2.9.0.post0 - # via jupyter-client -pyyaml==6.0.1 + # via + # jupyter-client + # matplotlib +pyyaml==6.0.2 # via # jupyter-cache # myst-nb @@ -187,11 +225,11 @@ pyyaml==6.0.1 # pybtex # rocm-docs-core # sphinx-external-toc -pyzmq==26.2.0 +pyzmq==26.4.0 # via # ipykernel # jupyter-client -referencing==0.36.1 +referencing==0.36.2 # via # jsonschema # jsonschema-specifications @@ -199,23 +237,23 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.18.2 +rocm-docs-core[api-reference]==1.18.2 # via -r requirements.in -rpds-py==0.22.3 +rpds-py==0.24.0 # via # jsonschema # referencing -six==1.16.0 +six==1.17.0 # via # pybtex # python-dateutil -smmap==5.0.1 +smmap==5.0.2 # via gitdb snowballstemmer==2.2.0 # via sphinx -soupsieve==2.5 +soupsieve==2.7 # via beautifulsoup4 -sphinx==7.4.7 +sphinx==8.1.3 # via # breathe # myst-nb @@ -228,15 +266,15 @@ sphinx==7.4.7 # sphinx-external-toc # sphinx-notfound-page # sphinxcontrib-bibtex -sphinx-book-theme==1.1.3 +sphinx-book-theme==1.1.4 # via rocm-docs-core sphinx-copybutton==0.5.2 # via rocm-docs-core -sphinx-design==0.6.0 +sphinx-design==0.6.1 # via rocm-docs-core sphinx-external-toc==1.0.1 # via rocm-docs-core -sphinx-notfound-page==1.0.3 +sphinx-notfound-page==1.1.0 # via 
rocm-docs-core sphinxcontrib-applehelp==2.0.0 # via sphinx @@ -252,18 +290,20 @@ sphinxcontrib-qthelp==2.0.0 # via sphinx sphinxcontrib-serializinghtml==2.0.0 # via sphinx -sqlalchemy==2.0.37 +sqlalchemy==2.0.40 # via jupyter-cache stack-data==0.6.3 # via ipython tabulate==0.9.0 # via jupyter-cache -tomli==2.0.1 +tomli==2.2.1 # via sphinx tornado==6.4.2 # via # ipykernel # jupyter-client +tqdm==4.67.1 + # via mpire traitlets==5.14.3 # via # comm @@ -274,21 +314,22 @@ traitlets==5.14.3 # matplotlib-inline # nbclient # nbformat -typing-extensions==4.12.2 +typing-extensions==4.13.2 # via + # beautifulsoup4 # ipython # myst-nb # pydata-sphinx-theme # pygithub # referencing # sqlalchemy -urllib3==2.2.2 +urllib3==2.4.0 # via # pygithub # requests wcwidth==0.2.13 # via prompt-toolkit -wrapt==1.16.0 +wrapt==1.17.2 # via deprecated zipp==3.21.0 # via importlib-metadata From 01cb8379cd9b7ce401085e60b39abde50e7dc734 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 24 Apr 2025 10:14:52 -0700 Subject: [PATCH 068/443] make code compliant with std=c++20 (#2123) --- include/ck/library/utility/fill.hpp | 4 ++-- include/ck/library/utility/host_tensor.hpp | 2 +- include/ck_tile/host/fill.hpp | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/ck/library/utility/fill.hpp b/include/ck/library/utility/fill.hpp index 3336041354..35625d142e 100644 --- a/include/ck/library/utility/fill.hpp +++ b/include/ck/library/utility/fill.hpp @@ -94,7 +94,7 @@ struct FillMonotonicSeq template void operator()(ForwardIter first, ForwardIter last) const { - std::generate(first, last, [=, n = init_value_]() mutable { + std::generate(first, last, [=, *this, n = init_value_]() mutable { auto tmp = n; n += step_; return tmp; @@ -150,7 +150,7 @@ struct TransformIntoStructuralSparsity template void operator()(ForwardIter first, ForwardIter last) const { - std::for_each(first, last, [=, idx = 0](T& elem) mutable { + 
std::for_each(first, last, [=, *this, idx = 0](T& elem) mutable { auto tmp_idx = idx; idx += 1; return elem *= valid_sequences[tmp_idx % (sizeof(valid_sequences) / sizeof(T))]; diff --git a/include/ck/library/utility/host_tensor.hpp b/include/ck/library/utility/host_tensor.hpp index edf58b20b4..2cbca29afc 100644 --- a/include/ck/library/utility/host_tensor.hpp +++ b/include/ck/library/utility/host_tensor.hpp @@ -252,7 +252,7 @@ struct ParallelTensorFunctor std::size_t iw_begin = it * work_per_thread; std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d); - auto f = [=] { + auto f = [=, *this] { for(std::size_t iw = iw_begin; iw < iw_end; ++iw) { call_f_unpack_args(mF, GetNdIndices(iw)); diff --git a/include/ck_tile/host/fill.hpp b/include/ck_tile/host/fill.hpp index d90c0cf6cf..3f64eb28cd 100644 --- a/include/ck_tile/host/fill.hpp +++ b/include/ck_tile/host/fill.hpp @@ -280,7 +280,7 @@ struct FillMonotonicSeq template void operator()(ForwardIter first, ForwardIter last) const { - std::generate(first, last, [=, n = init_value_]() mutable { + std::generate(first, last, [=, *this, n = init_value_]() mutable { auto tmp = n; if constexpr(std::is_same_v) { @@ -315,7 +315,7 @@ struct FillStepRange template void operator()(ForwardIter first, ForwardIter last) const { - std::generate(first, last, [=, n = start_value_]() mutable { + std::generate(first, last, [=, *this, n = start_value_]() mutable { auto tmp = n; n += step_; if constexpr(IsAscending) @@ -388,7 +388,7 @@ struct AdjustToStructuredSparsity template void operator()(ForwardIter first, ForwardIter last) const { - std::transform(first, last, first, [=, index = start](T val) mutable { + std::transform(first, last, first, [=, *this, index = start](T val) mutable { auto tmp = val * masks[index % (sizeof(masks) / sizeof(int32_t))]; index += 1; From a2ed34a112982664132db5283ee4d1b1aac746d5 Mon Sep 17 00:00:00 2001 From: Khushbu Agarwal Date: Thu, 24 Apr 2025 10:20:22 -0700 Subject: [PATCH 069/443] 
MFMA_32x32x16 for gfx950 (#2121) * Enable MFMA_32x32x16 for fp16/BF16 for gfx950 * clang formatted --- include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp index e6350a8827..4732027e57 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp @@ -20,9 +20,15 @@ using WarpGemmMfmaF16F16F32M32N32K8 = WarpGemmImpl< using WarpGemmMfmaF16F16F32M16N16K16 = WarpGemmImpl< WarpGemmAtrributeMfma>>; +#if defined(__gfx950__) +using WarpGemmMfmaF16F16F32M32N32K16 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; + +#else using WarpGemmMfmaF16F16F32M32N32K16 = WarpGemmImpl, 2>>; +#endif #if defined(__gfx950__) using WarpGemmMfmaF16F16F32M16N16K32 = WarpGemmImpl< @@ -105,9 +111,15 @@ using WarpGemmMfmaBf16Bf16F32M32N32K8 = WarpGemmImpl< using WarpGemmMfmaBf16Bf16F32M16N16K16 = WarpGemmImpl< WarpGemmAtrributeMfma>>; +#if defined(__gfx950__) +using WarpGemmMfmaBf16Bf16F32M32N32K16 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; + +#else using WarpGemmMfmaBf16Bf16F32M32N32K16 = WarpGemmImpl, 2>>; +#endif #if defined(__gfx950__) using WarpGemmMfmaBf16Bf16F32M16N16K32 = WarpGemmImpl< From 41541aff7a3651b72977d3c52786a37bba24a7d2 Mon Sep 17 00:00:00 2001 From: joyeamd Date: Fri, 25 Apr 2025 16:31:09 +0800 Subject: [PATCH 070/443] SWDEV-52596 for hdim=256, when use splitkv pipeline, two new pipelines need to be added (#2126) --- example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index ca49af1496..75d84daf32 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -676,6 +676,12 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> 
pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask)) pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'row', 't', 'f', 'f', 'f', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 't', 'f', 'f', 'f', bias, 't', squant, pagedkv, mask)) + + pipelines.append(Pipeline('qr', 'row', 't', 't', 'f', 'f', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 't', 't', 'f', 'f', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) else: From 3d4d70d2fc6b1fe77d82e3cd2b5c9aae3a315b42 Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Sun, 27 Apr 2025 14:07:41 +0800 Subject: [PATCH 071/443] Avoid using store_tile_raw() for fp32 tensors (#2072) --- example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index 75d84daf32..5ad118fd1a 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -91,10 +91,12 @@ using fmha_pipeline_problem = ck_tile::BlockFmhaFwdSplitKVPipelineProblem< using fmha_pipeline = {F_pipeline}< fmha_pipeline_problem>; +/// FIXME: use {F_spad}/{F_dvpad} as kPadM/kPadN parameters after solving +/// store_tile_raw() data corruption issue using fmha_epilogue = ck_tile::Default2DEpilogue::OaccDataType, typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType, - {F_spad}, {F_dvpad}>>; + false, false>>; using fmha_kernel = ck_tile::FmhaFwdSplitKVKernel; From 8add2cf45d8c9b298d820c6cf7f158cc13936352 Mon Sep 17 00:00:00 2001 From: Yi DING Date: Mon, 28 Apr 2025 07:26:05 +0800 Subject: 
[PATCH 072/443] Fix fp8 convert & add option for basic example (#2129) --- example/ck_tile/03_gemm/CMakeLists.txt | 1 + include/ck_tile/core/numeric/float8.hpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt index 61c3a57391..411db2e317 100644 --- a/example/ck_tile/03_gemm/CMakeLists.txt +++ b/example/ck_tile/03_gemm/CMakeLists.txt @@ -5,4 +5,5 @@ if(CK_USE_OCP_FP8) list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8) endif() list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0) +target_compile_options(tile_example_gemm_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) target_compile_options(tile_example_gemm_universal PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) diff --git a/include/ck_tile/core/numeric/float8.hpp b/include/ck_tile/core/numeric/float8.hpp index a4e8ca6a2b..b5da468319 100644 --- a/include/ck_tile/core/numeric/float8.hpp +++ b/include/ck_tile/core/numeric/float8.hpp @@ -530,7 +530,7 @@ CK_TILE_HOST_DEVICE DstT run_cast_from_f8(SrcT x) } else { - if(x == 0x80) + if(x == SrcT(0x80)) { return fNeg0; } From edd92fc546663094f42366e12a172701f18a2fd9 Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Mon, 28 Apr 2025 11:14:21 +0600 Subject: [PATCH 073/443] DeviceGemm_Wmma_CShuffleV3 with BlockGemmPipelineVersion::v3 (#2096) * Prepare files for DeviceGemm_Wmma_CShuffleV3 * Implement main part of CShuffleV3 with block pipeline v3 for WMMA * Remove unused functions and template params for A/B descriptors * Support both gfx11 and gfx12 * Enable SplitK for gfx12 and disable for gfx11 * Added RowColRow layout for DeviceGemmV2 fp16 * Added more instances for Row, Col, Row data layout * Added instances for DeviceGemm_Wmma_CShuffleV3, Col, Row, Row data layout * Added instances for DeviceGemm_Wmma_CShuffleV3, Col, Col, Row data layout * Added more instances for DeviceGemm_Wmma_CShuffleV3, Row, Row, Row data layout * Fix formatting * 
Add documentation Based on e5ad48a7843a16a1ed0c1268b5dba7dfe2d59e4d * Enable gemm_universal profiling for gfx11/12 * Add WMMA intrinsics for F8/BF8 * Support F8/BF8 DeviceGemm_Wmma_CShuffleV3, add basic instances * Add BF16 instances and tests * Fix test_gemm_universal_wmma_fp8 by adding CK_USE_WMMA_FP8 --------- Co-authored-by: Anca Hamuraru --- CMakeLists.txt | 7 +- include/ck/ck.hpp | 2 +- include/ck/config.h.in | 6 +- .../blockwise_gemm_pipeline_wmma_selector.hpp | 60 + .../block/blockwise_gemm_pipeline_wmmaops.hpp | 85 + .../blockwise_gemm_pipeline_wmmaops_base.hpp | 309 +++ .../blockwise_gemm_pipeline_wmmaops_v3.hpp | 466 +++++ .../impl/device_gemm_wmma_cshuffle_v3.hpp | 542 ++++++ .../grid/gridwise_gemm_wmma_cshuffle_v3.hpp | 1725 +++++++++++++++++ .../tensor_operation/gpu/warp/wmma_gemm.hpp | 184 +- include/ck/utility/amd_buffer_addressing.hpp | 2 +- include/ck/utility/amd_wmma.hpp | 98 +- .../gpu/gemm_universal.hpp | 599 +----- .../gpu/gemm_universal_wmma.inc | 68 + .../gpu/gemm_universal_xdl.inc | 521 +++++ .../gpu/CMakeLists.txt | 38 +- .../gpu/gemm_universal/CMakeLists.txt | 68 +- ...wmma_universal_bf16_bf16_bf16_km_kn_mn.hpp | 64 + ...16_bf16_km_kn_mn_comp_default_instance.cpp | 25 + ...wmma_universal_bf16_bf16_bf16_km_nk_mn.hpp | 64 + ...16_bf16_km_nk_mn_comp_default_instance.cpp | 25 + ...wmma_universal_bf16_bf16_bf16_mk_kn_mn.hpp | 67 + ...16_bf16_mk_kn_mn_comp_default_instance.cpp | 25 + ...wmma_universal_bf16_bf16_bf16_mk_nk_mn.hpp | 64 + ...16_bf16_mk_nk_mn_comp_default_instance.cpp | 25 + ...mm_wmma_universal_f16_f16_f16_km_kn_mn.hpp | 64 + ...f16_f16_km_kn_mn_comp_default_instance.cpp | 24 + ...mm_wmma_universal_f16_f16_f16_km_nk_mn.hpp | 64 + ...f16_f16_km_nk_mn_comp_default_instance.cpp | 24 + ...mm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp | 67 + ...f16_f16_mk_kn_mn_comp_default_instance.cpp | 24 + ...mm_wmma_universal_f16_f16_f16_mk_nk_mn.hpp | 64 + ...f16_f16_mk_nk_mn_comp_default_instance.cpp | 24 + 
...emm_wmma_universal_f8_f8_bf16_mk_kn_mn.hpp | 51 + ...f8_bf16_mk_kn_mn_comp_default_instance.cpp | 27 + ...emm_wmma_universal_f8_f8_bf16_mk_nk_mn.hpp | 51 + ...f8_bf16_mk_nk_mn_comp_default_instance.cpp | 27 + .../profiler/profile_gemm_universal_impl.hpp | 2 +- profiler/src/CMakeLists.txt | 4 +- profiler/src/profile_gemm_universal.cpp | 10 +- test/gemm_universal/CMakeLists.txt | 32 +- .../test_gemm_universal_wmma_bf16.cpp | 80 + .../test_gemm_universal_wmma_fp16.cpp | 57 + .../test_gemm_universal_wmma_fp8.cpp | 61 + 44 files changed, 5326 insertions(+), 570 deletions(-) create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops.hpp create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_base.hpp create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v3.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_wmma.inc create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_xdl.inc create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn.hpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp create mode 100644 test/gemm_universal/test_gemm_universal_wmma_bf16.cpp create mode 100644 test/gemm_universal/test_gemm_universal_wmma_fp16.cpp create mode 100644 test/gemm_universal/test_gemm_universal_wmma_fp8.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ba57ead09a..4e12462a41 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -202,7 +202,7 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx9") set(CK_USE_XDL "ON") endif() if (SUPPORTED_GPU_TARGETS MATCHES "gfx94" OR SUPPORTED_GPU_TARGETS MATCHES "gfx95") - message("Enabling FP8 gemms on native architectures") + message("Enabling XDL FP8 gemms on native architectures") add_definitions(-DCK_USE_GFX94) set(CK_USE_GFX94 "ON") endif() @@ -211,6 +211,11 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1 add_definitions(-DCK_USE_WMMA) set(CK_USE_WMMA "ON") endif() +if (SUPPORTED_GPU_TARGETS MATCHES "gfx12") + message("Enabling WMMA FP8 gemms on native 
architectures") + add_definitions(-DCK_USE_WMMA_FP8) + set(CK_USE_WMMA_FP8 "ON") +endif() if (SUPPORTED_GPU_TARGETS MATCHES "gfx12" OR SUPPORTED_GPU_TARGETS MATCHES "gfx950") add_definitions(-DCK_USE_OCP_FP8) set(CK_USE_OCP_FP8 "ON") diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 83b76382bc..e38f166c1a 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -125,7 +125,7 @@ // buffer atomic add: floating point #ifndef __HIP_DEVICE_COMPILE__ // for host code #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1 -#elif defined(__gfx9__) // for GPU code +#elif defined(__gfx9__) || defined(__gfx12__) // for GPU code #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1 #else // for GPU code #define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0 diff --git a/include/ck/config.h.in b/include/ck/config.h.in index 994e60025d..306a6c2ff1 100644 --- a/include/ck/config.h.in +++ b/include/ck/config.h.in @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2025 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -115,6 +115,10 @@ #cmakedefine CK_USE_WMMA @CK_USE_WMMA@ #endif +#ifndef CK_USE_WMMA_FP8 +#cmakedefine CK_USE_WMMA_FP8 @CK_USE_WMMA_FP8@ +#endif + #ifndef CK_USE_GFX94 #cmakedefine CK_USE_GFX94 @CK_USE_GFX94@ #endif diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp new file mode 100644 index 0000000000..2fdabc6bc7 --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v3.hpp" + +namespace ck { + +template +constexpr auto BlockGemmPipeline_Selector() +{ + if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3) + { + return BlockwiseGemmWmmaops_pipeline_v3{}; + } + else + { + static_assert(false, "BlockGemmPipeline configuration is not available"); + } +} + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops.hpp new file mode 100644 index 0000000000..31c4729760 --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops.hpp @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" + +namespace ck { + +template +struct BlockwiseGemmWmmaops_pipeline_hotloop_inst +{ + static constexpr index_t WaveSize = 32; + static constexpr index_t WaveNumM = MPerBlock / (MRepeat * MPerWmma); + static constexpr index_t WaveNumN = NPerBlock / (NRepeat * NPerWmma); + + static constexpr index_t A_LDS_Read_Width = ALDSReadWidth; + static constexpr index_t B_LDS_Read_Width = BLDSReadWidth; + + static constexpr index_t A_Buffer_Load_Inst_Num = + MPerBlock * KPerBlock / (BlockSize * ABufferLoadWidth); + static constexpr index_t B_Buffer_Load_Inst_Num = + NPerBlock * KPerBlock / (BlockSize * BBufferLoadWidth); + + static constexpr index_t A_LDS_Write_Inst_Num = + MPerBlock * KPerBlock / (BlockSize * ALDSWriteWidth); + static constexpr index_t B_LDS_Write_Inst_Num = + NPerBlock * KPerBlock / (BlockSize * BLDSWriteWidth); + + static constexpr index_t A_LDS_Read_Inst_Num = + WaveNumN * MPerBlock * KPerBlock / (BlockSize * ALDSReadWidth); + static constexpr index_t B_LDS_Read_Inst_Num = + WaveNumM * NPerBlock * KPerBlock / (BlockSize * BLDSReadWidth); + + static constexpr index_t 
C_WMMA_Inst_Num = MPerBlock * NPerBlock * KPerBlock / + (BlockSize / WaveSize) / + (MPerWmma * NPerWmma * KPerWmma); + + static constexpr auto Print() + { + printf(" Blk/Wave Size: %d, %d, M/N/K PerBlk: %d, %d, %d, M/N/K PerWmma: %d, %d, %d\n", + BlockSize, + WaveSize, + MPerBlock, + NPerBlock, + KPerBlock, + MPerWmma, + NPerWmma, + KPerWmma); + + printf(" A/B buffer load inst: %d, %d\n A/B LDS write inst: %d, %d\n A/B LDS read inst: " + "%d, %d\n C WMMA inst: %d\n" + "A/B LDS read width: %d, %d, A/B LDS write width: %d, %d, A/B buffer load width: " + "%d, %d\n", + A_Buffer_Load_Inst_Num, + B_Buffer_Load_Inst_Num, + A_LDS_Write_Inst_Num, + B_LDS_Write_Inst_Num, + A_LDS_Read_Inst_Num, + B_LDS_Read_Inst_Num, + C_WMMA_Inst_Num, + A_LDS_Read_Width, + B_LDS_Read_Width, + ALDSWriteWidth, + BLDSWriteWidth, + ABufferLoadWidth, + BBufferLoadWidth); + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_base.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_base.hpp new file mode 100644 index 0000000000..a63d32802e --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_base.hpp @@ -0,0 +1,309 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/utility/blkgemmpipe_scheduler.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/warp/wmma_gemm.hpp" +#include "ck/tensor_description/tensor_adaptor.hpp" + +namespace ck { + +template +struct BlockwiseGemmWmmaops_pipeline_base +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I5 = Number<5>{}; + + using ThisThreadBlock = ThisThreadBlock; + + static constexpr index_t WaveSize = 32; + + static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWmma); + static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWmma); + +#if defined(__gfx12__) + static constexpr index_t A_KRow = 2; + static constexpr index_t B_KRow = 2; +#else + static constexpr index_t A_KRow = 1; + static constexpr index_t B_KRow = 1; +#endif + + static constexpr index_t A_K1 = AWmmaTileDesc{}.GetLength(I5); + static constexpr index_t B_K1 = BWmmaTileDesc{}.GetLength(I5); + + static_assert(KPack % (A_K1 * A_KRow) == 0, "wrong!"); + static_assert(KPack % (B_K1 * B_KRow) == 0, "wrong!"); + + static constexpr auto wmma_gemm = + WmmaGemm{}; + + static constexpr index_t KRepeat = KPerBlock / KPack; + + static constexpr auto WmmaK = Number{}; + + using HotLoopInstList = + ck::BlockwiseGemmWmmaops_pipeline_hotloop_inst; + + StaticBufferTupleOfVector + c_thread_buf_; + + __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; } + + __device__ static auto GetWaveIdx() + { + const index_t thread_id = ThisThreadBlock::GetThreadId(); + + constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))), + make_tuple(Sequence<0, 1, 2>{}), + 
make_tuple(Sequence<0>{})); + + return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); + } + + __device__ static auto CalculateAThreadOriginDataIndex() + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + + const auto wmma_a_idx = wmma_gemm.CalculateAThreadOriginDataIndex(); + +#if defined(__gfx12__) + const auto wmma_krow = wmma_gemm.GetSubGroupId(); +#else + const auto wmma_krow = 0; +#endif + + // |KRepeat |MRepeat|MWave |KRow |MLane |KPack + return make_tuple(0, 0, waveId_m, wmma_krow, wmma_a_idx, 0); + } + + __device__ static auto CalculateBThreadOriginDataIndex() + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_n = wave_idx[I1]; + + const auto wmma_b_idx = wmma_gemm.CalculateBThreadOriginDataIndex(); + +#if defined(__gfx12__) + const auto wmma_krow = wmma_gemm.GetSubGroupId(); +#else + const auto wmma_krow = 0; +#endif + + // |KRepeat |NRepeat|Nwave |KRow |NLane |KPack + return make_tuple(0, 0, waveId_n, wmma_krow, wmma_b_idx, 0); + } + + template + __device__ static auto CalculateCThreadOriginDataIndex(Number, Number) + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + const auto waveId_n = wave_idx[I1]; + + const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk(); + + constexpr auto mrepeat_mwave_mperwmma_to_m_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerWmma))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + constexpr auto nrepeat_nwave_nperwmma_to_n_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerWmma))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + const index_t c_thread_m = mrepeat_mwave_mperwmma_to_m_adaptor.CalculateBottomIndex( + make_tuple(m0, waveId_m, blk_idx[I0]))[I0]; + const index_t c_thread_n = nrepeat_nwave_nperwmma_to_n_adaptor.CalculateBottomIndex( + 
make_tuple(n0, waveId_n, blk_idx[I1]))[I0]; + + return make_tuple(c_thread_m, c_thread_n); + } + + using Tuple6 = decltype(CalculateAThreadOriginDataIndex()); + + /** + * @brief Constructor for BlockwiseGemmWmmaops_pipeline_base. + * + * This constructor initializes the thread copy objects for matrices A and B. + * It also performs several compile-time checks to ensure the correctness of the + * matrix tile descriptors. + * + * @param a_origin The origin data index for matrix A. + * @param b_origin The origin data index for matrix B. + * + * @note The constructor includes static assertions to ensure that: + * - The matrix tile descriptors for A and B are known at compile-time. + * - The number of threads in the thread block matches the product of MWaves, NWaves, and + * WaveSize. + * - The dimensions of the block are divisible by the product of the corresponding WMMA and + * repeat dimensions. + */ + __host__ __device__ + BlockwiseGemmWmmaops_pipeline_base(Tuple6 a_origin = CalculateAThreadOriginDataIndex(), + Tuple6 b_origin = CalculateBThreadOriginDataIndex()) + : a_thread_copy_(a_origin), b_thread_copy_(b_origin) + { + static_assert(AWmmaTileDesc::IsKnownAtCompileTime() && + BWmmaTileDesc::IsKnownAtCompileTime(), + "wrong! 
Desc should be known at compile-time"); + + static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize, + "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n"); + + static_assert(MPerBlock % (MPerWmma * MRepeat) == 0 && + NPerBlock % (NPerWmma * NRepeat) == 0, + "wrong!"); + } + + __host__ __device__ static constexpr auto + GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() + { + constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = + wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); + + constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; + constexpr auto AccStride = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I3]; + return make_naive_tensor_descriptor( + // |MRepeat |MWave |MSubGroup |NRepeat |NWave + // |NThreadPerSubGroup |MAccVgprs + make_tuple(Number{}, I1, I1, Number{}, I1, I1, MAccVgprs), + make_tuple(Number{} * MAccVgprs * AccStride, + Number{} * MAccVgprs * AccStride, + Number{} * MAccVgprs * AccStride, + MAccVgprs * AccStride, + MAccVgprs * AccStride, + MAccVgprs * AccStride, + AccStride)); + } + + __host__ __device__ static constexpr auto + GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() + { + constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return wmma_gemm + .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma); + } + + // Describe how data allocated in thread copy src buffer + // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma + static constexpr AWmmaTileDesc a_block_desc_k0_m0_m1_m2_k1; + static constexpr BWmmaTileDesc b_block_desc_k0_n0_n1_n2_k1; + + protected: + static constexpr auto a_thread_desc_ 
= + make_naive_tensor_descriptor(make_tuple(Number{}, + Number{}, + Number{}, + I1, + I1, + Number{}), + make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number<1>{})); + + static constexpr auto b_thread_desc_ = + make_naive_tensor_descriptor(make_tuple(Number{}, + Number{}, + Number{}, + I1, + I1, + Number{}), + make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number<1>{})); + + // C[M, N, NumRegWmma] + static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, wmma_gemm.GetRegSizePerWmma())); + + using AThreadCopy = + ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3, 4, 5>, + 5, + A_K1, + A_K1>; + + using BThreadCopy = + ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3, 4, 5>, + 5, + B_K1, + B_K1>; + + AThreadCopy a_thread_copy_; + BThreadCopy b_thread_copy_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v3.hpp new file mode 100644 index 0000000000..2fb95f0f8d --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v3.hpp @@ -0,0 +1,466 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_base.hpp" + +namespace ck { + +// Compute optimized pipeline +// GlobalPrefetchStages: 2 +// LocalPreFillStages: 1 +// LocalPreFetchStages: 1 +// LocalSharedMemoryBuffer: 1 + +template +struct BlockwiseGemmWmmaops_pipeline_v3 +{ +}; + +template +struct BlockwiseGemmWmmaops_pipeline_v3 + : BlockwiseGemmWmmaops_pipeline_base +{ + using Base = BlockwiseGemmWmmaops_pipeline_base; + using Base::I0; + + using Base::A_K1; + using Base::A_KRow; + using Base::B_K1; + using Base::B_KRow; + using Base::KRepeat; + using Base::WmmaK; + + using Base::wmma_gemm; + using typename Base::HotLoopInstList; + + using Base::CalculateCThreadOriginDataIndex; + using Base:: + GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs; + using Base::GetCThreadBuffer; + using Base:: + GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs; + + using Base::a_block_desc_k0_m0_m1_m2_k1; + using Base::b_block_desc_k0_n0_n1_n2_k1; + + static constexpr index_t PrefetchStages = 2; + static constexpr index_t PrefillStages = 1; + static constexpr index_t GlobalBufferNum = 1; + + __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop) + { + return num_loop > PrefetchStages; + } + + __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop) + { + ignore = num_loop; + return TailNumber::Full; + } + + __device__ static constexpr auto HotLoopScheduler() + { + // TODO: Calculation of the number of instructions may require changes for WMMA + /* + // A/B split schedule + // compiler is likely to use ds_read2 when instruction width smaller than 16bytes + constexpr auto num_ds_read_inst_a = + HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 + ? 
HotLoopInstList::A_LDS_Read_Inst_Num + : HotLoopInstList::A_LDS_Read_Inst_Num / 2; + constexpr auto num_ds_read_inst_b = + HotLoopInstList::B_LDS_Read_Width * sizeof(BDataType) == 16 + ? HotLoopInstList::B_LDS_Read_Inst_Num + : HotLoopInstList::B_LDS_Read_Inst_Num / 2; + + constexpr auto num_ds_write_inst_a = HotLoopInstList::A_LDS_Write_Inst_Num; + constexpr auto num_ds_write_inst_b = HotLoopInstList::B_LDS_Write_Inst_Num; + + constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num; + constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num; + + constexpr auto num_wmma_inst = HotLoopInstList::C_WMMA_Inst_Num; + + constexpr auto wmma_cycle = NPerWmma == 16 ? 16 : 32; + constexpr auto ds_read_a_issue_cycle = + HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4; + constexpr auto ds_read_b_issue_cycle = + HotLoopInstList::B_LDS_Read_Width * sizeof(BDataType) == 16 ? 8 : 4; + constexpr auto ds_read_a_wmma_rate = + (wmma_cycle - 4 + 2 * ds_read_a_issue_cycle - 1) / (2 * ds_read_a_issue_cycle); + constexpr auto ds_read_b_wmma_rate = + (wmma_cycle - 4 + 2 * ds_read_b_issue_cycle - 1) / (2 * ds_read_b_issue_cycle); + + constexpr auto num_dsread_a_wmma = + (num_ds_read_inst_a + ds_read_a_wmma_rate - 1) / ds_read_a_wmma_rate; + constexpr auto num_dsread_b_wmma = + (num_ds_read_inst_b + ds_read_b_wmma_rate - 1) / ds_read_b_wmma_rate; + + // stage 1 + // Separate this part? + // constexpr auto num_wmma_per_ds_read = sizeof(ComputeDataType) / sizeof(ADataType) > + // sizeof(ComputeDataType) / sizeof(BDataType) + // ? 
sizeof(ComputeDataType) / sizeof(ADataType) + // : sizeof(ComputeDataType) / sizeof(BDataType); + constexpr auto num_wmma_stage1 = num_wmma_inst - (num_dsread_a_wmma + num_dsread_b_wmma); + constexpr auto num_wmma_per_issue = + num_wmma_stage1 / (num_buffer_load_inst_a + num_buffer_load_inst_b); + constexpr auto num_dswrite_per_issue_a = num_ds_write_inst_a / num_buffer_load_inst_a; + constexpr auto num_dswrite_per_issue_b = num_ds_write_inst_b / num_buffer_load_inst_b; + + static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) { + ignore = i; + static_for<0, num_dswrite_per_issue_a, 1>{}([&](auto idswrite) { + ignore = idswrite; + __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // WMMA + }); + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + __builtin_amdgcn_sched_group_barrier( + 0x008, num_wmma_per_issue - num_dswrite_per_issue_a, 0); // WMMA + }); + static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) { + ignore = i; + static_for<0, num_dswrite_per_issue_b, 1>{}([&](auto idswrite) { + ignore = idswrite; + __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // WMMA + }); + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + __builtin_amdgcn_sched_group_barrier( + 0x008, num_wmma_per_issue - num_dswrite_per_issue_b, 0); // WMMA + }); + + // stage 2 + static_for<0, num_dsread_a_wmma, 1>{}([&](auto i) { + if constexpr((num_ds_read_inst_a - (i + 1) * ds_read_a_wmma_rate) >= + ds_read_a_wmma_rate) + { + __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_wmma_rate, 0); // DS read + } + else + { + __builtin_amdgcn_sched_group_barrier(0x100, + num_ds_read_inst_a - (num_dsread_a_wmma - 1) * + ds_read_a_wmma_rate, + 0); // DS read + } + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // WMMA + }); + + static_for<0, num_dsread_b_wmma, 1>{}([&](auto i) { + if constexpr((num_ds_read_inst_b - (i + 1) * 
ds_read_b_wmma_rate) >= + ds_read_b_wmma_rate) + { + __builtin_amdgcn_sched_group_barrier(0x100, ds_read_b_wmma_rate, 0); // DS read + } + else + { + __builtin_amdgcn_sched_group_barrier(0x100, + num_ds_read_inst_b - (num_dsread_b_wmma - 1) * + ds_read_b_wmma_rate, + 0); // DS read + } + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // WMMA + }); + */ + } + + template + __device__ void Run(const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + const BBlockDesc& b_block_desc, + BBlockTransfer& b_blockwise_copy, + const BGridBuffer& b_grid_buf, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + CThreadBuffer& c_thread_buf, + index_t num_loop) const + { + __builtin_amdgcn_sched_barrier(0); + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + + // Global prefetch 1 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + // Local prefill 1 + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + // Global prefetch 2 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + // Initialize C + c_thread_buf.Clear(); + + // Local prefetch 1 + block_sync_lds(); + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run( + a_block_desc_k0_m0_m1_m2_k1, + 
make_tuple(Number{}, m0, I0, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, m0, k0, I0, I0, I0), + a_thread_buf); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run( + b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, n0, I0, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, n0, k0, I0, I0, I0), + b_thread_buf); + }); + }); + + __builtin_amdgcn_sched_barrier(0); + + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + do + { + block_sync_lds(); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack / A_KRow, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + }); + static_for<0, KPack / B_KRow, 1>{}([&](auto ik) { + b_thread_vec.template AsType()(ik) = + b_thread_buf[Number{}]; + }); + + using wmma_input_type_a = + typename vector_type::type; + using wmma_input_type_b = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + wmma_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + + block_sync_lds(); + + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + a_thread_copy_.Run( + a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, m0, I0, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, m0, k0, I0, I0, I0), + a_thread_buf); + }); + static_for<0, 
NRepeat, 1>{}([&](auto n0) { + b_thread_copy_.Run( + b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, n0, I0, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, n0, k0, I0, I0, I0), + b_thread_buf); + }); + }); + + HotLoopScheduler(); + __builtin_amdgcn_sched_barrier(0); + + i += 1; + } while(i < (num_loop - 1)); + } + // tail + if constexpr(TailNum == TailNumber::Full) + { + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack / A_KRow, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + }); + static_for<0, KPack / B_KRow, 1>{}([&](auto ik) { + b_thread_vec.template AsType()(ik) = + b_thread_buf[Number{}]; + }); + + using wmma_input_type_a = + typename vector_type::type; + using wmma_input_type_b = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + wmma_gemm.Run(a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + // Let's leak last WMMA block to epilogue region, cover the potential lds-shuffle + // latency + // __builtin_amdgcn_sched_barrier(0); + } + } + + protected: + using Base::a_thread_copy_; + using Base::a_thread_desc_; + using Base::b_thread_copy_; + using Base::b_thread_desc_; + using Base::c_thread_desc_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp new file mode 100644 index 0000000000..1ef8a9b8ad --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp @@ -0,0 +1,542 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/host_utility/flush_cache.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +/// @brief \"Universal\" GEMM operation with SplitK support. +/// +/// @par Overview +/// This GEMM operation implements the following mathematical equation: +/// C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N})) +/// Where A, B are input tensors and C is the output tensor. The A/B/C_op are +/// elementwise operations applied to the A, B, and C tensors, respectively. +/// The \"universal\" gemm comes with multiple pipelines optimized for different usage +/// scenarios. That's why it's called \"universal\". It's universal through it's design +/// and versatilty. +/// +/// @note This Kernel implementation supports SplitK algorithm. It can be configured +/// to split the dot product accumulated over the K dimension into multiple working groups. +/// The partial products of different workgroups are then reduced using the AtomicAdd +/// operation. +/// +/// @tparam ALayout A tensor data layout. +/// @tparam BLayout B tensor data layout. +/// @tparam CLayout C tensor data layout. +/// @tparam ADataType A tensor data type. +/// @tparam BDataType B tensor data type. +/// @tparam CDataType C tensor data type. +/// @tparam AccDataType The accumulation data type related to the hardware +/// matrix-multiplication instruction. 
+/// @tparam CShuffleDataType The data type used to store matrix-multiplication results into +/// LDS memory during \"CShuffle\" data layout optimization. +/// @tparam AElementwiseOperation Elementwise operation applied to the A input tensor elements. +/// @tparam BElementwiseOperation Elementwise operation applied to the B input tensor elements. +/// @tparam CElementwiseOperation Elementwise operation applied to the C output tensor +/// (after GEMM). +/// @tparam GemmSpec Determines used "padding" version. +/// @tparam BlockSize The number of threads within workgroup. +/// @tparam MPerBlock The input/output data tile size in the M dimension. +/// @tparam NPerBlock The input/output data tile size in the N dimension. +/// @tparam KPerBlock The input data tile size in the K dimension. +/// @tparam AK1 The vector load size from global memory for A tensor. +/// @tparam BK1 The vector load size from global memory for B tensor. +/// @tparam MPerWmma M size of Wave Matrix Multiply Accumulate (WMMA) instruction. +/// @tparam NPerWmma N size of Wave Matrix Multiply Accumulate (WMMA) instruction. +/// @tparam MRepeat The number of iterations in the M dimension over output tile per wavefront. +/// @tparam NRepeat The number of iterations in the N dimension over output tile per wavefront. +/// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input +/// data. Can be interpreted as the answer +/// to the question, "How many threads can be +/// arranged on each input data axis?" +/// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over +/// the input tensor dimension. Can be interpreted +/// as the answer to the question: "In which +/// order to spread threads through tensor axes?". +/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be +/// interpreted as the answer to the question "Which dimension +/// to read first? And which next?" etc. 
+/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do vectorized memory +/// access - the one with contiguous memory. +/// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of +/// elements accessed per thread per instruction. +/// @tparam ABlockTransferDstScalarPerVector_AK1 The size of vectorized store into LDS memory. +/// @tparam ABlockLdsExtraM Whether to use padding for LDS or not. With +/// universal GEMM there's no need for padding. +/// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input +/// data. Can be interpreted as the answer +/// to the question: "How many threads to +/// arrange on each input data axis?" +/// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over +/// the input tensor dimension. Can be interpreted +/// as the answer to the question: "In which +/// order to spread threads through tensor axes?". +/// @tparam BBlockTransferSrcAccessOrder he order of accessing input tensor axes. Can be +/// interpreted as the answer to the question "Which dimension +/// to read first? And which next?" etc. +/// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do vectorized memory +/// access - the one with contiguous memory. +/// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of +/// elements accessed per thread per instruction. +/// @tparam BBlockTransferDstScalarPerVector_BK1 The size of vectorized store into LDS memory. +/// @tparam BBlockLdsExtraN Whether to use padding for LDS or not. With +/// universal GEMM there's no need for padding. +/// @tparam CShuffleMRepeatPerShuffle The number of matrix-multiplication instructions +/// results to process per wave per iteration of CShuffle +/// in M dimension. 
+/// @tparam CShuffleNRepeatPerShuffle The number of matrix-multiplication instructions +/// results to process per wave per iteration of CShuffle +/// in N dimension. +/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial +/// thread distribution used for storing data into output +/// tensor across output data layout dimensions. +/// @tparam CShuffleBlockTransferScalarPerVector_NPerBlock The size of vectorized memory access. +/// Used when storing data to output tensor. +/// @tparam BlkGemmPipeSched The version of blockwise-gemm pipeline scheduler (interwave or +/// intrawave). +/// @tparam BlkGemmPipelineVer The version of blockwise-gemm pipeline. +/// @tparam ComputeTypeA Data type used for A input of hardware matrix-multiplication +/// instructions. +/// @tparam ComputeTypeB Data type used for B input of hardware matrix-multiplication +/// instructions. +/// @tparam PermuteA Whether the A input tensor has gridwise-gemm friendly data layout +/// in global memory. Currently not supported! +/// @tparam PermuteB Whether the B input tensor has gridwise-gemm friendly data layout +/// in global memory (pre-shuffled). 
+template +struct DeviceGemm_Wmma_CShuffleV3 : public DeviceGemmV2 +{ + // GridwiseGemm + using GridwiseGemm = GridwiseGemm_wmma_cshuffle_v3< + ALayout, + BLayout, + CLayout, + ADataType, + BDataType, + AccDataType, + CShuffleDataType, + CDataType, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + GemmSpec, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerWmma, + NPerWmma, + MRepeat, + NRepeat, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMRepeatPerShuffle, + CShuffleNRepeatPerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CShuffleBlockTransferScalarPerVector_NPerBlock, + BlkGemmPipeSched, + BlkGemmPipelineVer, + ComputeTypeA, + ComputeTypeB, + PermuteA, + PermuteB>; + + using Argument = typename GridwiseGemm::Argument; + + /// @brief Helper structure responsible for kernel invocation. + /// + /// @paragraph The `Invoker` class is responsible for preparation and invocation of actual GPU + /// kernel function. It usually determines the launched grid size prepares kernel + /// arguments as well as perform specific kernel configuration selection based on + /// runtime arguments. + /// + /// @note If appropriately configured it may measure kernel execution time. + /// + struct Invoker : public BaseInvoker + { + /// @brief This function issues GPU kernel execution. + /// @param arg The GPU kernel arguments. + /// @param stream_config The HIP stream configuration helper structure. 
+ /// @return The kernel's average execution time (if time measurement is + /// enabled). + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(stream_config.log_level_ > 0) + { + arg.Print(); + GridwiseGemm::BlockwiseGemmPipe::HotLoopInstList::Print(); + } + + if(!GridwiseGemm::CheckValidity(arg)) + { + throw std::runtime_error("wrong! GridwiseGemm has invalid setting"); + } + + index_t gdx, gdy, gdz; + std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.KBatch); + + float ave_time = 0; + + index_t k_grain = arg.KBatch * KPerBlock; + index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock; + + const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split); + + const auto Run = [&](const auto& kernel) { + if(stream_config.flush_cache) + { + Argument arg_ = arg; + + const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1( + arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0); + const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1( + arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0); + + auto size_a_buffer = + a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType); + auto size_b_buffer = + b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType); + + ck::utility::RotatingMemWrapper rotating_mem( + arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer); + rotating_mem.Print(); + + auto run_flush_cache = [&]() { + // flush icache + ck::utility::flush_icache(); + // rotating mem + rotating_mem.Next(); + // clear c mem + if(arg_.KBatch > 1) + HIP_CHECK_ERROR(hipMemsetAsync(arg_.p_c_grid, + 0, + arg_.M * arg_.N * sizeof(CDataType), + stream_config.stream_id_)); + }; + + ave_time = ck::utility::launch_and_time_kernel_with_preprocess( + stream_config, + run_flush_cache, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + arg_); + } + else + { + if(arg.KBatch > 1) + 
HIP_CHECK_ERROR(hipMemsetAsync(arg.p_c_grid, + 0, + arg.M * arg.N * sizeof(CDataType), + stream_config.stream_id_)); + + ave_time = launch_and_time_kernel( + stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg); + } + }; + + constexpr index_t minimum_occupancy = []() { + if constexpr(BlkGemmPipeSched == BlockGemmPipelineScheduler::Interwave) + { + return 2; + } + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3) + { + return (MPerBlock * NPerBlock / BlockSize <= 128) ? 2 : 1; + } + else + { + return 1; + } + }(); + + if(has_main_k_block_loop) + { + // Tail number always full + if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3) + { + if(arg.KBatch > 1) + { + const auto kernel = + kernel_gemm_wmma_cshuffle_v3; + Run(kernel); + } + else + { + const auto kernel = + kernel_gemm_wmma_cshuffle_v3; + Run(kernel); + } + } + else + { + // TODO: Implement + } + } + else + { + // TODO: Implement + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if(!ck::is_gfx11_supported() && !ck::is_gfx12_supported()) + { + return false; + } + + if constexpr(std::is_same_v || + std::is_same_v) + { + if(arg.KBatch > 1 && ck::is_gfx11_supported()) + { + // gfx11 does not support *_atomic_pk_add_f16/bf16 instructions + return false; + } + } + + if constexpr(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v) + { + if(ck::is_gfx11_supported()) + { + return false; + } + } + + if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::NKPadding || + GemmSpec == GemmSpecialization::MNKPadding || + GemmSpec == 
GemmSpecialization::KPadding)) + { + return false; + } + + return GridwiseGemm::CheckValidity(arg); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + index_t GetKPerBlock() override { return KPerBlock; } + + bool GetPermuteA() override { return PermuteA; } + bool GetPermuteB() override { return PermuteB; } + + static auto MakeArgument(const ADataType* p_a, + const BDataType* p_b, + CDataType* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + index_t KBatch, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation) + { + return Argument{p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC, KBatch}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + index_t KBatch, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + M, + N, + K, + StrideA, + StrideB, + StrideC, + KBatch); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + std::map BlkGemmPipelineSchedulerToString{ + {BlockGemmPipelineScheduler::Intrawave, "Intrawave"}, + {BlockGemmPipelineScheduler::Interwave, "Interwave"}}; + + std::map BlkGemmPipelineVersionToString{ + {BlockGemmPipelineVersion::v1, "v1"}, + {BlockGemmPipelineVersion::v2, "v2"}, + {BlockGemmPipelineVersion::v3, "v3"}, + {BlockGemmPipelineVersion::v4, "v4"}, + {BlockGemmPipelineVersion::v5, "v5"}}; + + // clang-format off + str << "DeviceGemm_Wmma_CShuffleV3" + << "<" + << 
getGemmSpecializationString(GemmSpec) << ", " + << std::string(ALayout::name)[0] + << std::string(BLayout::name)[0] + << std::string(CLayout::name)[0] + << ">" + << " BlkSize: " + << BlockSize << ", " + << "BlkTile: " + << MPerBlock << "x" << NPerBlock << "x" << KPerBlock << ", " + << "WaveTile: " + << MPerWmma << "x"< +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy) +#endif + kernel_gemm_wmma_cshuffle_v3(typename GridwiseGemm::Argument karg) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__)) +#if defined(__gfx11__) + // gfx11 does not support *_atomic_pk_add_f16/bf16 instructions + using c_data_type = remove_cvref_t>; + if constexpr(!(CGlobalMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd && + (std::is_same_v || + std::is_same_v))) + { +#endif + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg); + + GridwiseGemm::template Run( + karg.p_a_grid + splitk_batch_offset.a_k_split_offset, + karg.p_b_grid + splitk_batch_offset.b_k_split_offset, + karg.p_c_grid + splitk_batch_offset.c_reduce_offset, + p_shared, + karg); +#if defined(__gfx11__) + } +#endif +#else + ignore = karg; +#endif +} + +/// @brief \"Universal\" GEMM kernel with SplitK support. +/// +/// @par Overview +/// This GEMM kernel is carrying out following mathematical equation: +/// C{M,N} = C_op(A_op(A{M,K}) * B_op(B{K,N})) +/// Where A, B are input tensors and C is the output tensor. The A/B/C_op are +/// elementwise operations that could be applied on each tensor respectively. +/// The \"universal\" gemm comes with multiple pipelines optimized for different usage +/// scenarios. That's why it's called \"universal\". It's universal through it's design +/// and versatilty. +/// +/// @note This Kernel implementation supports SplitK algorithm. 
It can be configured +/// to split the dot product accumulated over the K dimension into multiple working groups. +/// The partial products of different workgroups are then reduced using the AtomicAdd +/// operation. +/// +/// @tparam ALayout A tensor data layout. +/// @tparam BLayout B tensor data layout. +/// @tparam CLayout C tensor data layout. +/// @tparam ADataType A tensor data type. +/// @tparam BDataType B tensor data type. +/// @tparam AccDataType The accumulation data type related to the hardware +/// matrix-multiplication instruction. +/// @tparam CShuffleDataType The data type used to store matrix-multiplication results into +/// LDS memory during \"CShuffle\" data layout optimization. +/// @tparam CDataType C tensor data type. +/// @tparam AElementwiseOperation Elementwise operation applied to the A input tensor elements. +/// @tparam BElementwiseOperation Elementwise operation applied to the B input tensor elements. +/// @tparam CElementwiseOperation Elementwise operation applied to the C output tensor +/// (after GEMM). +/// @tparam GemmSpec Determines used "padding" version. +/// @tparam BlockSize The number of threads within workgroup. +/// @tparam MPerBlock The input/output data tile size in the M dimension. +/// @tparam NPerBlock The input/output data tile size in the N dimension. +/// @tparam KPerBlock The input data tile size in the K dimension. +/// @tparam AK1Value The vector load size from global memory for A tensor. +/// @tparam BK1Value The vector load size from global memory for B tensor. +/// @tparam MPerWmma M size of Wave Matrix Multiply Accumulate (WMMA) instruction. +/// @tparam NPerWmma N size of Wave Matrix Multiply Accumulate (WMMA) instruction. +/// @tparam MRepeat The number of iterations in the M dimension over output tile per wavefront. +/// @tparam NRepeat The number of iterations in the N dimension over output tile per wavefront. 
+/// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input +/// data. Can be interpreted as the answer +/// to the question, "How many threads can be +/// arranged on each input data axis?" +/// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over +/// the input tensor dimension. Can be interpreted +/// as the answer to the question: "In which +/// order to spread threads through tensor axes?". +/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be +/// interpreted as the answer to the question "Which dimension +/// to read first? And which next?" etc. +/// @tparam ABlockTransferSrcVectorDim The index of axis on which we could do vectorized memory +/// access - the one with contiguous memory. +/// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of +/// elements accessed per thread per instruction. +/// @tparam ABlockTransferDstScalarPerVector_AK1 The size of vectorized store into LDS memory. +/// @tparam AThreadTransferSrcResetCoordinateAfterRun Decides whether we reset thread coordinate +/// (return back to the window origin) after all thread finish data copy. +/// @tparam ABlockLdsExtraM Whether to use padding for LDS or not. With +/// universal GEMM there's no need for padding. +/// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input +/// data. Can be interpreted as the answer +/// to the question: "How many threads to +/// arrange on each input data axis?" +/// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over +/// the input tensor dimension. Can be interpreted +/// as the answer to the question: "In which +/// order to spread threads through tensor axes?". +/// @tparam BBlockTransferSrcAccessOrder he order of accessing input tensor axes. 
Can be +/// interpreted as the answer to the question "Which dimension +/// to read first? And which next?" etc. +/// @tparam BBlockTransferSrcVectorDim The index of axis on which we could do vectorized memory +/// access - the one with contiguous memory. +/// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of +/// elements accessed per thread per instruction. +/// @tparam BBlockTransferDstScalarPerVector_BK1 The size of vectorized store into LDS memory. +/// @tparam BThreadTransferSrcResetCoordinateAfterRun Decides whether we reset thread coordinate +/// (return back to the window origin) after all thread finish data copy. +/// @tparam BBlockLdsExtraN Whether to use padding for LDS or not. With universal GEMM +/// there's no need for padding. +/// @tparam CShuffleMRepeatPerShuffle The number of matrix-multiplication instructions +/// results to process per wave per iteration of CShuffle +/// in M dimension. +/// @tparam CShuffleNRepeatPerShuffle The number of matrix-multiplication instructions +/// results to process per wave per iteration of CShuffle +/// in N dimension. +/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial +/// thread distribution used for storing data into output +/// tensor across output data layout dimensions. +/// @tparam CShuffleBlockTransferScalarPerVector_NPerBlock The size of vectorized memory access. +/// Used when storing data to output tensor. +/// @tparam BlkGemmPipeSched The version of blockwise-gemm pipeline scheduler (interwave or +/// intrawave). +/// @tparam BlkGemmPipelineVer The version of blockwise-gemm pipeline. +/// @tparam ComputeTypeA Data type used for A input of hardware matrix-multiplication +/// instructions. +/// @tparam ComputeTypeB Data type used for B input of hardware matrix-multiplication +/// instructions. +/// @tparam PermuteA Whether the A input tensor has gridwise-gemm friendly data layout +/// in global memory. 
Currently not supported! +/// @tparam PermuteB Whether the B input tensor has gridwise-gemm friendly data layout +/// in global memory (pre-shuffled). +template +struct GridwiseGemm_wmma_cshuffle_v3 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto AK0Number = Number{}; + static constexpr auto BK0Number = Number{}; + static constexpr auto AK1Number = Number{}; + static constexpr auto BK1Number = Number{}; + + static constexpr index_t KPack = math::max( + math::lcm(AK1Number, BK1Number), + WmmaSelector::selected_wmma + .k_per_wmma); + + using ThisThreadBlock = ThisThreadBlock; + + static constexpr index_t APackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + + static constexpr index_t BPackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + + __host__ static auto CalculateGridSize(index_t M, index_t N, index_t KBatch) + { + return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, KBatch); + } + + __host__ static auto CalculateMPadded(index_t M) + { + return math::integer_least_multiple(M, MPerBlock); + } + + __host__ static auto CalculateNPadded(index_t N) + { + return math::integer_least_multiple(N, NPerBlock); + } + + __host__ static auto CalculateKPadded(index_t K) + { + return math::integer_divide_ceil(K, KPerBlock) * KPerBlock; + } + + __host__ static auto CalculateAK0Padded(index_t K, index_t K_Batch = 1) + { + auto K_t = K_Batch * KPerBlock; + return (K + K_t - 1) / K_t * (KPerBlock / AK1Value); + } + + __host__ static auto CalculateBK0Padded(index_t K, index_t K_Batch = 1) + { + auto K_t = K_Batch * KPerBlock; + return (K + K_t 
- 1) / K_t * (KPerBlock / BK1Value); + } + + __host__ static auto CalculateKPadded(index_t K, index_t K_Batch = 1) + { + auto K_t = K_Batch * KPerBlock; + return (K + K_t - 1) / K_t * KPerBlock; + } + + __host__ static auto CalculateKRead(index_t K, index_t K_Batch = 1) + { + constexpr auto KReadVec = math::lcm(AK1Number, BK1Number); + auto K_t = K_Batch * KReadVec; + return (K + K_t - 1) / K_t * KReadVec; + } + + __host__ static auto CalculateMBlock(index_t M) + { + return math::integer_divide_ceil(M, MPerBlock); + } + + __host__ static auto CalculateNBlock(index_t N) + { + return math::integer_divide_ceil(N, NPerBlock); + } + + template + __host__ __device__ static constexpr auto MakeWmmaTileDescriptor(const BlockDesc&) + { + // K0_N_K1 -> K0_MNRepeat_MNWaves_MNPerWmma_K1 + constexpr auto K0 = BlockDesc{}.GetLength(I0); + constexpr auto K1 = BlockDesc{}.GetLength(I2); +#ifdef __gfx12__ + constexpr auto KRow = I2; +#else + constexpr auto KRow = I1; +#endif + return transform_tensor_descriptor( + BlockDesc{}, + make_tuple(make_unmerge_transform(make_tuple(Number{}, KRow)), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{})); + } + + __host__ __device__ static auto MakeAGridDescriptor_AK0_M_AK1( + index_t M, index_t MPad, index_t K, index_t KPad, index_t StrideA, index_t AK0) + { + const auto a_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA)); + } + }(); + + using GemmSpecialization = tensor_operation::device::GemmSpecialization; + + if constexpr(GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad both M and K + const 
auto a_grid_desc_m_k = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_right_pad_transform(M, MPad - M), + make_right_pad_transform(K, KPad - K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)), + make_pass_through_transform(MPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MNPadding) + { + // pad M, but not K + const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor( + a_grid_desc_mraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)), + make_right_pad_transform(M, MPad - M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else if constexpr(GemmSpec == GemmSpecialization::KPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad K, but not M + const auto a_grid_desc_m_k = transform_tensor_descriptor( + a_grid_desc_mraw_kraw, + make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else + { + static_assert(!PermuteA, "PermuteA is not supported"); + + // not pad M or K + const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor( + a_grid_desc_mraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)), + 
make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + } + + __host__ __device__ static auto MakeBGridDescriptor_BK0_N_BK1( + index_t K, index_t KPad, index_t N, index_t NPad, index_t StrideB, index_t BK0) + { + const auto b_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(N, K), make_tuple(StrideB, I1)); + } + }(); + + using GemmSpecialization = tensor_operation::device::GemmSpecialization; + + static_assert(!(is_same_v, pk_i4_t> && + GemmSpec != GemmSpecialization::Default), + "pk_i4_t does not support padding"); + + if constexpr(GemmSpec == GemmSpecialization::NKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad both N and K + const auto b_grid_desc_n_k = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_right_pad_transform(N, NPad - N), + make_right_pad_transform(K, KPad - K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor( + b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)), + make_pass_through_transform(NPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::MNPadding) + { + // pad N, but not K + const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor( + b_grid_desc_nraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)), + make_right_pad_transform(N, NPad - N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; 
+ } + else if constexpr(GemmSpec == GemmSpecialization::KPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad K, but not N + const auto b_grid_desc_n_k = transform_tensor_descriptor( + b_grid_desc_nraw_kraw, + make_tuple(make_pass_through_transform(N), make_right_pad_transform(K, KPad - K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor( + b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else + { + if constexpr(!PermuteB) + { + // not pad N or K + const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor( + b_grid_desc_nraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else + { + // Pre-shuffled Weight + // BGlobal[K / KPerBlock, N, KPerBlock / K1, K1] -> BTile[K / K1, N, K1] + constexpr index_t BK01 = KPerBlock / BK1Value; + const index_t BK0_ = StrideB / BK1Value; + const index_t BK00 = BK0_ / BK01; + + const auto b_grid_desc_bk00_n_bk01_bk1_permute = + make_naive_tensor_descriptor_packed(make_tuple(BK00, N, BK01, BK1Value)); + + const auto b_grid_desc_bk0_n_bk1_permute = transform_tensor_descriptor( + b_grid_desc_bk00_n_bk01_bk1_permute, + make_tuple(make_merge_transform(make_tuple(BK00, BK01)), + make_pass_through_transform(make_tuple(N)), + make_pass_through_transform(BK1Value)), + make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return b_grid_desc_bk0_n_bk1_permute; + } + } + } + + template + __host__ __device__ static constexpr auto MakeAWmmaTileDescriptor(const 
ABlockDesc_AK0_M_AK1&) + { + constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWmma); + + return MakeWmmaTileDescriptor(ABlockDesc_AK0_M_AK1{}); + } + + template + __host__ __device__ static constexpr auto MakeBWmmaTileDescriptor(const BBlockDesc_BK0_N_BK1&) + { + constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWmma); + + return MakeWmmaTileDescriptor(BBlockDesc_BK0_N_BK1{}); + } + + __host__ __device__ static auto + MakeCGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC) + { + const auto c_grid_desc_mraw_nraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC)); + } + }(); + + // pad M and N + return transform_tensor_descriptor(c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(M, MPad - M), + make_right_pad_transform(N, NPad - N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + // TODO: Investigate why this path is not used in the original + // gridwise_gemm_xdl_cshuffle_v3.hpp +#if 0 + using GemmSpecialization = tensor_operation::device::GemmSpecialization; + + if constexpr(GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad M and N + return transform_tensor_descriptor(c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(M, MPad - M), + make_right_pad_transform(N, NPad - N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad M, but not N + return transform_tensor_descriptor( + c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + 
make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad N, but not M + return transform_tensor_descriptor( + c_grid_desc_mraw_nraw, + make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else + { + // not pad M or N + return c_grid_desc_mraw_nraw; + } +#endif + } + + struct Problem + { + __host__ Problem(index_t M_, + index_t N_, + index_t K_, + index_t StrideA_, + index_t StrideB_, + index_t StrideC_, + index_t KBatch_) + : M{M_}, + N{N_}, + K{K_}, + StrideA{StrideA_}, + StrideB{StrideB_}, + StrideC{StrideC_}, + KBatch{KBatch_}, + MPadded{CalculateMPadded(M_)}, + NPadded{CalculateNPadded(N_)}, + KRead{CalculateKRead(K_, KBatch_)}, + KPadded{CalculateKPadded(K_, KBatch_)}, + AK0{CalculateAK0Padded(K_, KBatch_)}, + BK0{CalculateBK0Padded(K_, KBatch_)}, + MBlock{CalculateMBlock(M_)}, + NBlock{CalculateNBlock(N_)} + { + } + + __host__ void Print() const + { + std::cout << "problem {" + << "M:" << M << ", " + << "N:" << N << ", " + << "K:" << K << ", " + << "SA:" << StrideA << ", " + << "SB:" << StrideB << ", " + << "SC:" << StrideC << ", " + << "MP:" << MPadded << ", " + << "NP:" << NPadded << ", " + << "KRead:" << KRead << ", " + << "KP:" << KPadded << ", " + << "AK0:" << AK0 << ", " + << "BK0:" << BK0 << ", " + << "MBlock: " << MBlock << ", " + << "NBlock: " << NBlock << "}" << std::endl; + } + + index_t M; + index_t N; + index_t K; + index_t StrideA; + index_t StrideB; + index_t StrideC; + index_t KBatch; + index_t MPadded; + index_t NPadded; + index_t KRead; + index_t KPadded; + index_t AK0; + index_t BK0; + index_t MBlock; + index_t NBlock; + }; + + // Argument + struct Argument : public tensor_operation::device::BaseArgument, public Problem + { + __host__ Argument(const ADataType* p_a_grid_, + const BDataType* p_b_grid_, + 
CDataType* p_c_grid_, + index_t M_, + index_t N_, + index_t K_, + index_t StrideA_, + index_t StrideB_, + index_t StrideC_, + index_t k_batch_, + bool is_reduce_ = false) + : Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_, k_batch_}, + p_a_grid{p_a_grid_}, + p_b_grid{p_b_grid_}, + p_c_grid{p_c_grid_}, + is_reduce(is_reduce_) + { + } + + __host__ __device__ inline bool IsReduceAdd() const + { + return (Problem::KBatch > 1) && is_reduce; + } + + __host__ __device__ inline bool IsAtomicAdd() const + { + return (Problem::KBatch > 1) && (!is_reduce); + } + + const ADataType* p_a_grid; + const BDataType* p_b_grid; + CDataType* p_c_grid; + bool is_reduce; + }; + + struct SplitKBatchOffset + { + + __device__ SplitKBatchOffset(Argument& karg) + { + if constexpr(is_same_v) + { + a_k_split_offset = blockIdx.z * karg.KRead / APackedSize; + } + else if constexpr(is_same_v) + { + a_k_split_offset = blockIdx.z * karg.KRead * karg.StrideA; + } + + if constexpr(is_same_v) + { + b_k_split_offset = blockIdx.z * karg.KRead * karg.StrideB; + } + else if constexpr(is_same_v) + { + if constexpr(!PermuteB) + { + b_k_split_offset = blockIdx.z * karg.KRead / BPackedSize; + } + else + { + const int k0_offset = karg.KRead * karg.N; + b_k_split_offset = blockIdx.z * k0_offset / BPackedSize; + } + } + + if(blockIdx.z < static_cast(karg.KBatch - 1)) + { + karg.K = karg.KRead; + } + else + { + karg.K = karg.K - karg.KRead * (karg.KBatch - 1); + } + + if(karg.IsReduceAdd()) + { + c_reduce_offset = blockIdx.z * karg.M * karg.N; + } + else + { + c_reduce_offset = 0; + } + } + + index_t a_k_split_offset; + index_t b_k_split_offset; + index_t c_reduce_offset; + }; + + __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() + { + // A matrix in LDS memory, dst of blockwise copy + if constexpr(ABlockLdsExtraM || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4) + { + // bank conflict when writting the data into LDS, but don't worry, we have whole entire + // loop to hide it 
in v4. it may give you some benefit from less valu in compute address + return make_naive_tensor_descriptor( + make_tuple(AK0Number, Number{}, AK1Number), + make_tuple(Number{} * AK1Number, AK1Number, I1)); + } + // xor tensor transformation request more unnecessary vgpr usage, would cause register spill + // in some cases. + else if constexpr(is_same::value) + { + constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(ADataType) / APackedSize; + constexpr auto MLdsLayer = LdsSize < 1 ? 1 : LdsSize; + constexpr auto a_lds_block_desc = make_naive_tensor_descriptor( + make_tuple( + AK0Number * Number{}, Number{}, AK1Number), + make_tuple(AK1Number, Number{}, I1)); + + constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor( + a_lds_block_desc, + make_tuple(make_xor_with_modulo_transform(make_tuple( + Number{}, Number{})), + make_pass_through_transform(AK1Number)), + make_tuple(Sequence<1, 0>{}, Sequence<2>{}), + make_tuple(Sequence<1, 0>{}, Sequence<2>{})); + + constexpr auto a_lds_block_desc_ak0_mldslayer_m_ak1 = transform_tensor_descriptor( + a_lds_block_desc_permuted, + make_tuple(make_unmerge_transform(make_tuple(AK0Number, Number{})), + make_pass_through_transform(Number{}), + make_pass_through_transform(AK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{})); + + constexpr auto a_lds_block_desc_ak0_m_ak1 = transform_tensor_descriptor( + a_lds_block_desc_ak0_mldslayer_m_ak1, + make_tuple(make_pass_through_transform(AK0Number), + make_merge_transform_v3_division_mod( + make_tuple(Number{}, Number{})), + make_pass_through_transform(AK1Number)), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return a_lds_block_desc_ak0_m_ak1; + } + else // ColumnMajor A + { + // kfold and mpair dimension is not always required. 
+ // more dimension in merge_transform increase the difficulty of generating immarg offset + // for compiler. + constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1); + constexpr auto M1 = MPerBlock / M0; + + constexpr auto KThreadWrite = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0); + constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite; + constexpr auto KThreadRead = 64 / MPerWmma; + constexpr auto K0PerThreadRead = AK0Number / KThreadRead; + + constexpr auto kfold = (AK1Number * M0 * sizeof(ADataType) > 128) + ? 1 + : 128 / (AK1Number * M0 * sizeof(ADataType)); + constexpr auto KThreadReadPerm = + (kfold * K0PerThreadWrite / K0PerThreadRead) > 1 + ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead) + : KThreadRead; + + // 1<=mpair<=n0 + constexpr auto mpair = (AK1Number * MPerWmma * sizeof(ADataType) > 128) + ? 1 + : ((128 / (AK1Number * MPerWmma * sizeof(ADataType))) > M0 + ? M0 + : 128 / (AK1Number * MPerWmma * sizeof(ADataType))); + + constexpr auto a_lds_block_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + AK1Number)); + + constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor( + a_lds_block_desc, + make_tuple( + make_pass_through_transform(Number{}), + make_pass_through_transform(Number{}), + make_xor_with_modulo_transform( + make_tuple(Number{}, Number{})), + make_pass_through_transform(Number{}), + make_pass_through_transform(AK1Number)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{})); + + constexpr auto a_lds_block_desc_unmerged = transform_tensor_descriptor( + a_lds_block_desc_permuted, + make_tuple( + make_pass_through_transform(Number{}), + make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, Number{})), + make_unmerge_transform(make_tuple(Number{}, 
Number{})), + make_pass_through_transform(Number{}), + make_pass_through_transform(AK1Number)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<1>{}, + Sequence<2>{}, + Sequence<0, 3>{}, + Sequence<4, 5>{}, + Sequence<6>{}, + Sequence<7>{})); + + constexpr auto a_lds_block_desc_ak0_m_ak1 = transform_tensor_descriptor( + a_lds_block_desc_unmerged, + make_tuple(make_merge_transform_v3_division_mod( + make_tuple(Number{}, + Number{}, + Number{}, + Number{})), + make_merge_transform_v3_division_mod( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(AK1Number)), + make_tuple(Sequence<0, 1, 4, 2>{}, Sequence<5, 6, 3>{}, Sequence<7>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return a_lds_block_desc_ak0_m_ak1; + } + } + + __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + // B matrix in LDS memory, dst of blockwise copy + if constexpr(BBlockLdsExtraN || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4) + { + // bank conflict when writting the data into LDS, but don't worry, we have whole entire + // loop to hide it in v4. it may give you some benefit from less valu in compute address + return make_naive_tensor_descriptor( + make_tuple(BK0Number, Number{}, BK1Number), + make_tuple(Number{} * BK1Number, BK1Number, I1)); + } + else if constexpr(is_same::value) + { + // NLdsLayer * K0 as logical Bank + constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(BDataType) / BPackedSize; + constexpr index_t NLdsLayer = LdsSize < 1 ? 
1 : LdsSize; + constexpr auto b_lds_block_desc = make_naive_tensor_descriptor( + make_tuple( + BK0Number * Number{}, Number{}, BK1Number), + make_tuple(BK1Number, Number{}, I1)); + + constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor( + b_lds_block_desc, + make_tuple(make_xor_with_modulo_transform(make_tuple( + Number{}, Number{})), + make_pass_through_transform(BK1Number)), + make_tuple(Sequence<1, 0>{}, Sequence<2>{}), + make_tuple(Sequence<1, 0>{}, Sequence<2>{})); + + constexpr auto b_lds_block_desc_bk0_nldslayer_n_bk1 = transform_tensor_descriptor( + b_lds_block_desc_permuted, + make_tuple(make_unmerge_transform(make_tuple(BK0Number, Number{})), + make_pass_through_transform(Number{}), + make_pass_through_transform(BK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{})); + + constexpr auto b_lds_block_desc_bk0_n_bk1 = transform_tensor_descriptor( + b_lds_block_desc_bk0_nldslayer_n_bk1, + make_tuple(make_pass_through_transform(BK0Number), + make_merge_transform_v3_division_mod( + make_tuple(Number{}, Number{})), + make_pass_through_transform(BK1Number)), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return b_lds_block_desc_bk0_n_bk1; + } + else // RowMajor B + { + constexpr auto N0 = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I1); + constexpr auto N1 = NPerBlock / N0; + + constexpr auto KThreadWrite = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0); + constexpr auto K0PerThreadWrite = BK0Number / KThreadWrite; + constexpr auto KThreadRead = 64 / NPerWmma; + constexpr auto K0PerThreadRead = BK0Number / KThreadRead; + + constexpr auto kfold = (BK1Number * N0 * sizeof(BDataType) > 128) + ? 1 + : 128 / (BK1Number * N0 * sizeof(BDataType)); + constexpr auto KThreadReadPerm = + (kfold * K0PerThreadWrite / K0PerThreadRead) > 1 + ? 
KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead) + : KThreadRead; + + // 1<=npair<=n0 + constexpr auto npair = (BK1Number * NPerWmma * sizeof(BDataType) > 128) + ? 1 + : ((128 / (BK1Number * NPerWmma * sizeof(BDataType))) > N0 + ? N0 + : 128 / (BK1Number * NPerWmma * sizeof(BDataType))); + + constexpr auto b_lds_block_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + BK1Number)); + + constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor( + b_lds_block_desc, + make_tuple( + make_pass_through_transform(Number{}), + make_pass_through_transform(Number{}), + make_xor_with_modulo_transform( + make_tuple(Number{}, Number{})), + make_pass_through_transform(Number{}), + make_pass_through_transform(BK1Number)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}, Sequence<5>{})); + + constexpr auto b_lds_block_desc_unmerged = transform_tensor_descriptor( + b_lds_block_desc_permuted, + make_tuple( + make_pass_through_transform(Number{}), + make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, Number{})), + make_unmerge_transform(make_tuple(Number{}, Number{})), + make_pass_through_transform(Number{}), + make_pass_through_transform(BK1Number)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<1>{}, + Sequence<2>{}, + Sequence<0, 3>{}, + Sequence<4, 5>{}, + Sequence<6>{}, + Sequence<7>{})); + + constexpr auto b_lds_block_desc_bk0_n_bk1 = transform_tensor_descriptor( + b_lds_block_desc_unmerged, + make_tuple(make_merge_transform_v3_division_mod( + make_tuple(Number{}, + Number{}, + Number{}, + Number{})), + make_merge_transform_v3_division_mod( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(BK1Number)), + 
make_tuple(Sequence<0, 1, 4, 2>{}, Sequence<5, 6, 3>{}, Sequence<7>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return b_lds_block_desc_bk0_n_bk1; + } + } + + __host__ __device__ static constexpr auto + // *Caution Here repeat is shuffle repeat + GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat() + { + constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWmma); + constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWmma); + + constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + + return c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat; + } + + using BlockwiseGemmPipe = remove_cvref_t< + decltype(BlockGemmPipeline_Selector< + BlkGemmPipelineVer, + BlkGemmPipeSched, + BlockSize, + ADataType, + BDataType, + ComputeTypeA, + ComputeTypeB, + AccDataType, + decltype(MakeAWmmaTileDescriptor(GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1())), + decltype(MakeBWmmaTileDescriptor(GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1())), + ABlockTransferSrcScalarPerVector, + BBlockTransferSrcScalarPerVector, + MPerBlock, + NPerBlock, + KPerBlock, + MPerWmma, + NPerWmma, + MRepeat, + NRepeat, + KPack>())>; + + __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number); + + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size_aligned = math::integer_least_multiple( + b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); + + // LDS 
allocation for C shuffle in LDS + constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = + GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat(); + + constexpr auto c_block_size = + c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat + .GetElementSpaceSize(); + + return math::max((a_block_space_size_aligned * sizeof(ADataType) / APackedSize + + b_block_space_size_aligned * sizeof(BDataType) / BPackedSize), + c_block_size * sizeof(CShuffleDataType)); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + __host__ static constexpr bool CheckValidity(const Argument& karg) + { + static_assert((MPerBlock % (MPerWmma * MRepeat) == 0) && + (NPerBlock % (NPerWmma * NRepeat)) == 0, + "Invalid tuning param!"); + + if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) && + !(is_same::value)) + { + if(!(karg.M % MPerBlock == 0)) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg M value is not a multiple of MPerBlock! M: " << karg.M << " " + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ + << std::endl; + } + return false; + } + } + + if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) && + (is_same::value)) + { + if(!(karg.N % NPerBlock == 0)) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg N value is not a multiple of NPerBlock! 
N: " << karg.N << " " + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ + << std::endl; + } + return false; + } + } + + if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::KPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)) + { + + auto K_t = karg.KBatch * KPerBlock; + if(!(karg.K % K_t == 0)) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg K value is not a multiple of K_Batch * K0PerBlock * K1! K: " + << karg.K << " " << __FILE__ << ":" << __LINE__ + << ", in function: " << __func__ << std::endl; + } + return false; + } + } + else + { + constexpr auto KReadVec = math::lcm(AK1Number, BK1Number); + auto K_t = karg.KBatch * KReadVec; + auto KReadPadSplited = math::integer_divide_ceil(karg.K, K_t) * KReadVec; + if((KReadPadSplited * (karg.KBatch - 1)) >= karg.K) + { + return false; + } + } + + if constexpr(is_same::value) + { + if(karg.K % ABlockTransferSrcScalarPerVector != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg K (" << karg.K + << ") value is not a multiple of ABlockTransferSrcScalarPerVector (" + << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":" + << __LINE__ << ", in function: " << __func__ << std::endl; + } + return false; + } + } + else + { + if(karg.M % ABlockTransferSrcScalarPerVector != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg M (" << karg.M + << ") value is not a multiple of ABlockTransferSrcScalarPerVector (" + << ABlockTransferSrcScalarPerVector << " )! 
" << __FILE__ << ":" + << __LINE__ << ", in function: " << __func__ << std::endl; + } + return false; + } + } + + if constexpr(is_same::value) + { + if(karg.N % BBlockTransferSrcScalarPerVector != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg N (" << karg.N + << ") value is not a multiple of BBlockTransferSrcScalarPerVector (" + << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":" + << __LINE__ << ", in function: " << __func__ << std::endl; + } + return false; + } + } + else + { + if(karg.K % BBlockTransferSrcScalarPerVector != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg K (" << karg.K + << ") value is not a multiple of BBlockTransferSrcScalarPerVector (" + << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":" + << __LINE__ << ", in function: " << __func__ << std::endl; + } + return false; + } + } + + if constexpr(is_same::value) + { + if(karg.N % CShuffleBlockTransferScalarPerVector_NPerBlock != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg N (" << karg.N + << ") value is not a multiple of " + "CShuffleBlockTransferScalarPerVector_NPerBlock (" + << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! " + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ + << std::endl; + } + return false; + } + } + else + { + if(karg.M % CShuffleBlockTransferScalarPerVector_NPerBlock != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg M (" << karg.M + << ") value is not a multiple of " + "CShuffleBlockTransferScalarPerVector_NPerBlock (" + << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! 
" + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ + << std::endl; + } + return false; + } + } + + if constexpr(!(is_same, half_t>::value || + is_same, float>::value || + is_same, bhalf_t>::value || + is_same, int32_t>::value)) + { + if(!karg.IsReduceAdd()) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << " KBatch: " << karg.KBatch << " > 1 is not supported yet" + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ + << std::endl; + } + if(karg.KBatch > 1) + { + return false; + } + } + } + + // check gridwise gemm pipeline + const auto num_k_loop = karg.AK0 / (KPerBlock / AK1Value); + + if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1) + { + if(num_k_loop <= BlockwiseGemmPipe::PrefetchStages) + { + return false; + } + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / KPerBlock; + + return BlockwiseGemmPipe::BlockHasHotloop(num_loop); + } + + __host__ static constexpr TailNumber CalculateKBlockLoopTailNum(index_t K) + { + const index_t num_loop = K / KPerBlock; + + return BlockwiseGemmPipe::BlockLoopTailNum(num_loop); + } + + template + __host__ __device__ static constexpr auto MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + const CGridDesc& c_grid_desc_m_n, index_t MBlock, index_t NBlock) + { + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + return c_grid_desc_mblock_mperblock_nblock_nperblock; + } + + // return block_id to C matrix tile idx (m0, n0) mapping + // if arch = gfx942 + using Block2CTileMap = BlockToCTileMap_Grouped_M00_N0_M01Adapt<8, 
MPerBlock, NPerBlock>; + // using Block2CTileMap = BlockToCTileMap_3DGrid_KSplit; + + template + __device__ static void Run(const ADataType* p_a_grid, + const BDataType* p_b_grid, + CDataType* p_c_grid, + void* p_shared, + const Problem& problem, + const AGridDesc_AK0_M_K1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_K1& b_grid_desc_bk0_n_bk1, + const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock& + c_grid_desc_mblock_mperblock_nblock_nperblock) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + const AElementwiseOperation a_element_op{}; + const BElementwiseOperation b_element_op{}; + const CElementwiseOperation c_element_op{}; + + // divide block work by [M, N] + const auto block_2_ctile_map = Block2CTileMap{problem.M, problem.N, 4}; + + const auto block_work_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_ctile_map.ValidCTileIndex( + block_work_idx, + make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { + return; + } + + const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]); + const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]); + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_n_id * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number); + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_ak0_m_ak1 = 
GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ADataType, + ADataType, + decltype(a_grid_desc_ak0_m_ak1), + decltype(a_block_desc_ak0_m_ak1), + ABlockTransferSrcAccessOrder, + Sequence<0, 1, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true, + BlockwiseGemmPipe::GlobalBufferNum>( + a_grid_desc_ak0_m_ak1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_ak0_m_ak1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BDataType, + BDataType, + decltype(b_grid_desc_bk0_n_bk1), + decltype(b_block_desc_bk0_n_bk1), + BBlockTransferSrcAccessOrder, + Sequence<0, 1, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true, + BlockwiseGemmPipe::GlobalBufferNum>( + b_grid_desc_bk0_n_bk1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + // Cast after lds + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared), 
a_block_desc_ak0_m_ak1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + reinterpret_cast(static_cast(p_shared) + a_block_space_size_aligned * + sizeof(ADataType) / + APackedSize), + b_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1Number, 0, 0); + + // Blockwise GEMM pipeline + static_assert(std::is_default_constructible_v); + auto blockwise_gemm_pipeline = BlockwiseGemmPipe{}; + auto c_thread_buf = blockwise_gemm_pipeline.GetCThreadBuffer(); + + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( + (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / + KPerBlock); + + blockwise_gemm_pipeline.template Run(a_grid_desc_ak0_m_ak1, + a_block_desc_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bk0_n_bk1, + b_block_desc_bk0_n_bk1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + c_thread_buf, + num_k_block_main_loop); + + // shuffle C and write out + { + // C mapping in single thread. 
+ constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = + blockwise_gemm_pipeline + .GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); + + // C mapping in single block + constexpr auto + c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp = + blockwise_gemm_pipeline + .GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); + + constexpr auto MWave = + c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp + .GetLength(I1); + constexpr auto MSubGroup = + c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp + .GetLength(I2); + constexpr auto NWave = + c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp + .GetLength(I4); + constexpr auto NThreadPerSubGroup = + c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp + .GetLength(I5); + constexpr auto MAccVgprs = + c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp + .GetLength(I6); + + // LDS descriptor, shuffle and write out in MRepeat x NRepeat times + constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = + GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat + .GetElementSpaceSize()); + + constexpr auto + c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = + transform_tensor_descriptor( + c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // MRepeat per shuffle repeat + MWave, // MWave + MSubGroup, // MSubGroup * MAccVgprs = MPerWmma + MAccVgprs)), + make_freeze_transform(I0), + 
make_unmerge_transform(make_tuple( + Number{}, // NRepeat per shuffle repeat + NWave, // NWave + NThreadPerSubGroup))), // NThreadPerSubGroup = NPerWmma + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<>{}, + Sequence<0, 1, 2, 6>{}, + Sequence<>{}, + Sequence<3, 4, 5>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm_pipeline.CalculateCThreadOriginDataIndex(I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor = + make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple( + MRepeat, MWave, MSubGroup, MAccVgprs))), + make_tuple(Sequence<0, 1, 2, 3>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor + .CalculateBottomIndex(make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_nrepeat_nwave_nthreadpersubgroup_adaptor = + make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple( + NRepeat, NWave, NThreadPerSubGroup))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_nrepeat_nwave_nthreadpersubgroup_adaptor + .CalculateBottomIndex(make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c_thread_copy_vgpr_to_lds = ThreadwiseTensorSliceTransfer_v1r3< + AccDataType, + CShuffleDataType, + decltype(c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs), + decltype(c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs), + ck::tensor_operation::element_wise::PassThrough, + Sequence, + Sequence<0, 
1, 2, 3, 4, 5, 6>, + 6, + 1, // vector write pixel + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + make_multi_index(0, + m_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + 0, + n_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3]), + ck::tensor_operation::element_wise::PassThrough{}}; + + // shuffle: blockwise copy C from LDS to global + auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1< + ThisThreadBlock, // ThreadGroup + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + CShuffleMRepeatPerShuffle * MWave * MPerWmma, + 1, + CShuffleNRepeatPerShuffle * NWave * NPerWmma>, // BlockSliceLengths, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + CShuffleDataType, // typename SrcData, + CDataType, // typename DstData, + decltype(c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat), + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, + true, // bool ThreadTransferSrcResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun> + {c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, + make_multi_index(0, 0, 0, 0), + c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0), + c_element_op}; + + // space filling curve for local reg & global memory + // space filling curve for threadwise C in VGPR + constexpr auto sfc_c_vgpr = + SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6>, + Sequence>{}; + + // space filling curve for shuffled blockwise C in global mem + constexpr auto sfc_c_global = + 
SpaceFillingCurve, + Sequence<0, 2, 1, 3>, + Sequence<1, + CShuffleMRepeatPerShuffle * MWave * MPerWmma, + 1, + CShuffleNRepeatPerShuffle * NWave * NPerWmma>>{}; + + constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto access_id) { + // make sure it's safe to write to LDS + block_sync_lds(); + + // each thread write its data from VGPR to LDS + c_thread_copy_vgpr_to_lds.Run( + c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + sfc_c_vgpr.GetIndexTupleOfNumber(access_id), + c_thread_buf, + c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + c_shuffle_block_buf); + + // make sure it's safe to read from LDS + block_sync_lds(); + + // each block copy its data from LDS to global + c_shuffle_block_copy_lds_to_global.Run( + c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, + c_shuffle_block_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + + if constexpr(access_id < num_access - 1) + { + constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id); + + // move on C + c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); + } + }); + } + } + + template + __device__ static void Run(const ADataType* p_a_grid, + const BDataType* p_b_grid, + CDataType* p_c_grid, + void* p_shared, + const Problem& problem) + { + const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1( + problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0); + const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1( + problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0); + const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N( + problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC); + const auto 
c_grid_desc_mblock_mperblock_nblock_nperblock = + MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n, problem.MBlock, problem.NBlock); + + Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared, + problem, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + c_grid_desc_mblock_mperblock_nblock_nperblock); + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index 1abae56be4..429df2413f 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -22,6 +22,10 @@ enum struct WmmaInstr wmma_f32_16x16x16_f16_gfx12, wmma_f32_16x16x16_bf16_gfx12, wmma_i32_16x16x16_iu8_gfx12, + wmma_f32_16x16x16_f8f8_gfx12, + wmma_f32_16x16x16_f8bf8_gfx12, + wmma_f32_16x16x16_bf8f8_gfx12, + wmma_f32_16x16x16_bf8bf8_gfx12, }; /* @@ -400,6 +404,146 @@ struct wmma_type +struct wmma_type> +{ + // Absolute fixing property + static constexpr index_t m_per_wmma = 16; + static constexpr index_t n_per_wmma = 16; + static constexpr index_t k_per_wmma = 16; + static constexpr index_t acc_data_size = 4; + static constexpr index_t acc_pack_number = 1; + static constexpr index_t num_thread_per_subgroups = n_per_wmma; + + // Wave mode dependent propety + static constexpr index_t wave_size = Number{}; + static constexpr index_t num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma / wave_size; + static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + static_assert(wave_size == 32, "only support wave32 for gfx12 wmma"); + if constexpr(wave_size == 32) + { +#ifdef __gfx12__ + 
intrin_wmma_f32_16x16x16_f8f8_w32_gfx12::Run(a, b, reg_c); +#else + ignore = a; + ignore = b; + ignore = reg_c; +#endif + } + } +}; + +template +struct wmma_type> +{ + // Absolute fixing property + static constexpr index_t m_per_wmma = 16; + static constexpr index_t n_per_wmma = 16; + static constexpr index_t k_per_wmma = 16; + static constexpr index_t acc_data_size = 4; + static constexpr index_t acc_pack_number = 1; + static constexpr index_t num_thread_per_subgroups = n_per_wmma; + + // Wave mode dependent propety + static constexpr index_t wave_size = Number{}; + static constexpr index_t num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma / wave_size; + static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + static_assert(wave_size == 32, "only support wave32 for gfx12 wmma"); + if constexpr(wave_size == 32) + { +#ifdef __gfx12__ + intrin_wmma_f32_16x16x16_f8bf8_w32_gfx12::Run(a, b, reg_c); +#else + ignore = a; + ignore = b; + ignore = reg_c; +#endif + } + } +}; + +template +struct wmma_type> +{ + // Absolute fixing property + static constexpr index_t m_per_wmma = 16; + static constexpr index_t n_per_wmma = 16; + static constexpr index_t k_per_wmma = 16; + static constexpr index_t acc_data_size = 4; + static constexpr index_t acc_pack_number = 1; + static constexpr index_t num_thread_per_subgroups = n_per_wmma; + + // Wave mode dependent propety + static constexpr index_t wave_size = Number{}; + static constexpr index_t num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma / wave_size; + static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + static_assert(wave_size == 32, "only support wave32 for gfx12 wmma"); + if constexpr(wave_size == 32) + { +#ifdef __gfx12__ + intrin_wmma_f32_16x16x16_bf8f8_w32_gfx12::Run(a, b, reg_c); +#else + 
ignore = a; + ignore = b; + ignore = reg_c; +#endif + } + } +}; + +template +struct wmma_type> +{ + // Absolute fixing property + static constexpr index_t m_per_wmma = 16; + static constexpr index_t n_per_wmma = 16; + static constexpr index_t k_per_wmma = 16; + static constexpr index_t acc_data_size = 4; + static constexpr index_t acc_pack_number = 1; + static constexpr index_t num_thread_per_subgroups = n_per_wmma; + + // Wave mode dependent propety + static constexpr index_t wave_size = Number{}; + static constexpr index_t num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma / wave_size; + static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + static_assert(wave_size == 32, "only support wave32 for gfx12 wmma"); + if constexpr(wave_size == 32) + { +#ifdef __gfx12__ + intrin_wmma_f32_16x16x16_bf8bf8_w32_gfx12::Run(a, b, reg_c); +#else + ignore = a; + ignore = b; + ignore = reg_c; +#endif + } + } +}; + template + constexpr auto GetWmma() + { + return WmmaInstr::wmma_f32_16x16x16_f8f8_gfx12; + } + + template <> + constexpr auto GetWmma() + { + return WmmaInstr::wmma_f32_16x16x16_f8bf8_gfx12; + } + + template <> + constexpr auto GetWmma() + { + return WmmaInstr::wmma_f32_16x16x16_bf8f8_gfx12; + } + + template <> + constexpr auto GetWmma() + { + return WmmaInstr::wmma_f32_16x16x16_bf8bf8_gfx12; + } + // get_warp_size do not return the correct wavesize, hardcode to 32 as workaround static constexpr auto selected_wmma = wmma_type(), Number<32>{}>{}; @@ -612,14 +781,17 @@ struct WmmaGemm (is_same::value && is_same::value && is_same::value) || (is_same::value && is_same::value && - is_same::value) + is_same::value) || + ((is_same::value || is_same::value) && + (is_same::value || is_same::value) && + is_same::value) || #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 - || (is_same::value && is_same::value && - is_same::value) + (is_same::value && is_same::value 
&& + is_same::value) || #endif - , + false, "base type couple must be (half, float), (bhalf, float), (half, half), (bhalf, bhalf), " - "(int8, int32) or (int4, int32)!"); + "((f8 or bf8, f8 or bf8), float), (int8, int32) or (int4, int32)!"); static_for<0, KPack / wmma_instr.k_per_wmma, 1>{}([&](auto k) { if constexpr(!TransposeC) { diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp index 317f324e6d..62e3220b5a 100644 --- a/include/ck/utility/amd_buffer_addressing.hpp +++ b/include/ck/utility/amd_buffer_addressing.hpp @@ -581,7 +581,7 @@ __device__ void amd_global_atomic_add_impl(const typename vector_type::typ tmp.template AsType()[i]); }); } -#if defined(__gfx942__) || defined(__gfx950__) +#if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx12__) else if constexpr(is_same::value) { vector_type tmp{src_thread_data}; diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp index aa519fb2be..e14c0d62a8 100644 --- a/include/ck/utility/amd_wmma.hpp +++ b/include/ck/utility/amd_wmma.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#ifndef CK_AMD_WMMA_HPP #define CK_AMD_WMMA_HPP @@ -341,5 +341,101 @@ struct intrin_wmma_i32_16x16x16_iu8_w32_gfx12<16, 16, neg_a, neg_b, clamp> } }; +// src: f8, f8, dst: fp32 +template +struct intrin_wmma_f32_16x16x16_f8f8_w32_gfx12; + +template <> +struct intrin_wmma_f32_16x16x16_f8f8_w32_gfx12<16, 16> +{ + template + __device__ static void Run(const f8x8_t& reg_a, const f8x8_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx12__) + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12( + bit_cast(reg_a), + bit_cast(reg_b), + reg_c.template AsType()[Number<0>{}]); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; +#endif + } +}; + +// src: f8, bf8, dst: fp32 +template +struct intrin_wmma_f32_16x16x16_f8bf8_w32_gfx12; + +template <> +struct intrin_wmma_f32_16x16x16_f8bf8_w32_gfx12<16, 16> +{ + template + __device__ static void Run(const f8x8_t& reg_a, const bf8x8_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx12__) + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12( + bit_cast(reg_a), + bit_cast(reg_b), + reg_c.template AsType()[Number<0>{}]); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; +#endif + } +}; + +// src: bf8, f8, dst: fp32 +template +struct intrin_wmma_f32_16x16x16_bf8f8_w32_gfx12; + +template <> +struct intrin_wmma_f32_16x16x16_bf8f8_w32_gfx12<16, 16> +{ + template + __device__ static void Run(const bf8x8_t& reg_a, const f8x8_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx12__) + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12( + bit_cast(reg_a), + bit_cast(reg_b), + reg_c.template AsType()[Number<0>{}]); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; +#endif + } +}; + +// src: bf8, bf8, dst: fp32 +template +struct intrin_wmma_f32_16x16x16_bf8bf8_w32_gfx12; + +template <> +struct intrin_wmma_f32_16x16x16_bf8bf8_w32_gfx12<16, 16> +{ + template + __device__ static void Run(const bf8x8_t& 
reg_a, const bf8x8_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx12__) + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12( + bit_cast(reg_a), + bit_cast(reg_b), + reg_c.template AsType()[Number<0>{}]); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; +#endif + } +}; + } // namespace ck #endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp index 4218c51ca3..79212e16dd 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -7,521 +7,22 @@ #include #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#ifdef CK_USE_WMMA +#include "gemm_universal_wmma.inc" +#endif +#ifdef CK_USE_XDL +#include "gemm_universal_xdl.inc" +#endif + namespace ck { namespace tensor_operation { namespace device { namespace instance { -#ifdef CK_ENABLE_FP16 -void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instances( - std::vector>>& - instances); - -void 
add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instances( - std::vector>>& - instances); -#endif -#if(defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP8)) -void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instances( - 
std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v1_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v2_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instances( - std::vector>>& - instances); -void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v1_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnpadding_instances( - std::vector>>& - instances); - -void 
add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v1_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v2_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances( - std::vector>>& - instances); -void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v1_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v2_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instances( - std::vector>>& - instances); -#endif -#ifdef CK_ENABLE_BF16 -void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instances( - std::vector>>& - instances); - 
-void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instances( - std::vector>>& - instances); -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instances( - 
std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instances( - std::vector>>& - instances); -#endif -#if(defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP8)) -void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_default_instances( - std::vector>>& - instances); 
- -void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v1_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v2_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instances( - std::vector>>& - instances); -#endif template > op_ptrs; +#ifdef CK_USE_WMMA +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + 
add_device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_default_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_default_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_default_instances(op_ptrs); + } + } +#endif +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instances( + op_ptrs); + } + } +#endif +#if(defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP8)) + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_default_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances(op_ptrs); + } + } +#endif +#endif // CK_USE_WMMA + +#ifdef CK_USE_XDL #ifdef CK_ENABLE_FP16 if constexpr(is_same_v && is_same_v && is_same_v) @@ -822,6 +399,7 @@ struct DeviceOperationInstanceFactory< } #endif +#ifdef CK_ENABLE_FP16 if constexpr(is_same_v && is_same_v && is_same_v) { @@ -831,7 +409,8 @@ struct DeviceOperationInstanceFactory< add_device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(op_ptrs); } } - +#endif +#ifdef 
CK_ENABLE_BF16 if constexpr(is_same_v && is_same_v && is_same_v) { @@ -842,6 +421,8 @@ struct DeviceOperationInstanceFactory< op_ptrs); } } +#endif +#endif // CK_USE_XDL return op_ptrs; } diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_wmma.inc b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_wmma.inc new file mode 100644 index 0000000000..1396437326 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_wmma.inc @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +#ifdef CK_ENABLE_FP16 +void add_device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_default_instances( + std::vector>>& + instances); +#endif +#ifdef CK_ENABLE_BF16 +void add_device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instances( + std::vector>>& + instances); +#endif +#if(defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP8)) +void add_device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances( + 
std::vector>>& + instances); +#endif + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_xdl.inc new file mode 100644 index 0000000000..f0de713834 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_xdl.inc @@ -0,0 +1,521 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +#ifdef CK_ENABLE_FP16 +void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_default_instances( + std::vector>>& + 
instances); + +void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); +#endif +#if(defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP8)) +void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances); +void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances); + +void 
add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances); +void 
add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); +#endif +#ifdef CK_ENABLE_BF16 +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances( + 
std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instances( + std::vector>>& + instances); + +void 
add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instances( + std::vector>>& + instances); +#endif +#if(defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP8)) +void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v1_nkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void 
add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_mem_v2_nkpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_kpadding_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instances( + std::vector>>& + instances); +#endif + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index 70e54962ed..fe35d9ca76 100755 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -81,21 +81,29 @@ function(add_instance_library INSTANCE_NAME) list(REMOVE_ITEM ARGN "${source}") endif() endforeach() - # Do not build gemm_universal_f8 or gemm_multiply_multiply_f8 for any targets except gfx94 + # Do not build XDL gemm_universal_f8 or gemm_multiply_multiply_f8 for any targets except gfx94 if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH) + foreach(source IN LISTS ARGN) + if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx95" AND source MATCHES "gemm_multiply_multiply" AND source MATCHES "_f8_") + message("removing gemm_multiply_multiply_f8 instance ${source} ") + list(REMOVE_ITEM ARGN "${source}") + endif() + endforeach() + foreach(source IN LISTS 
ARGN) + if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx95" AND source MATCHES "gemm_xdl_universal" AND source MATCHES "_f8_") + message("removing gemm_universal_f8 instance ${source} ") + list(REMOVE_ITEM ARGN "${source}") + endif() + endforeach() + endif() + # Do not build WMMA gemm_universal_f8 for any targets except gfx12+ foreach(source IN LISTS ARGN) - if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx95" AND source MATCHES "gemm_multiply_multiply" AND source MATCHES "_f8_") - message("removing gemm_multiply_multiply_f8 instance ${source} ") - list(REMOVE_ITEM ARGN "${source}") - endif() + if(NOT INST_TARGETS MATCHES "gfx12" AND source MATCHES "gemm_wmma_universal" AND source MATCHES "_f8_") + message("removing gemm_universal_f8 instance ${source} ") + list(REMOVE_ITEM ARGN "${source}") + endif() endforeach() - foreach(source IN LISTS ARGN) - if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx95" AND source MATCHES "gemm_xdl_universal" AND source MATCHES "_f8_") - message("removing gemm_universal_f8 instance ${source} ") - list(REMOVE_ITEM ARGN "${source}") - endif() - endforeach() - endif() + message("remaining instances: ${ARGN}") #only continue if there are some source files left on the list if(ARGN) set(INST_OBJ) @@ -124,7 +132,10 @@ function(add_instance_library INSTANCE_NAME) endif() if(source MATCHES "gemm_multiply_multiply" AND source MATCHES "f8") list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic) - endif() + endif() + endif() + if(source MATCHES "gemm_wmma_universal" AND source MATCHES "f8") + list(FILTER INST_TARGETS INCLUDE REGEX "gfx12") endif() set(offload_targets) foreach(target IN LISTS INST_TARGETS) @@ -455,4 +466,3 @@ set(DEV_OPS_INC_DIRS ${PROJECT_SOURCE_DIR}/library/include/ck/ ) rocm_install(DIRECTORY 
${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck) - diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt index ade65eacf3..18eeefa522 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt @@ -1,7 +1,17 @@ -# ONLY XDL_KERNELS +# ONLY XDL_AND_WMMA_KERNELS set(GEMM_UNIVERSAL_INSTANCES) -list(APPEND GEMM_UNIVERSAL_INSTANCES +list(APPEND GEMM_UNIVERSAL_INSTANCES + device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp + device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp + device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_default_instance.cpp + device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_default_instance.cpp + + device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp + device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp + device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp + device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp + device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp @@ -18,7 +28,7 @@ list(APPEND GEMM_UNIVERSAL_INSTANCES 
device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v1_kpadding_instance.cpp device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp - + device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp @@ -57,6 +67,16 @@ list(APPEND GEMM_UNIVERSAL_INSTANCES device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instance.cpp ) +set_source_files_properties(device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") + +set_source_files_properties(device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") 
+set_source_files_properties(device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") + set_source_files_properties(device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") set_source_files_properties(device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") set_source_files_properties(device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") @@ -80,6 +100,9 @@ set_source_files_properties(device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm set_source_files_properties(device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") list(APPEND GEMM_UNIVERSAL_INSTANCES + device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp + device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp + device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp 
device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp @@ -134,25 +157,28 @@ list(APPEND GEMM_UNIVERSAL_INSTANCES device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp ) - set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") - set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") - set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") - set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") - set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") - set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") - - set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") - 
set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") - set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") - set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") - set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") - set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") - set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") - set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") - 
set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") - set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") - set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnkpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") + 
+set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnkpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") + +set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") 
+set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") add_instance_library(device_gemm_universal_instance ${GEMM_UNIVERSAL_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn.hpp new file mode 100644 index 0000000000..5d3bb3f7b4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn.hpp @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = bhalf_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_instances = + std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle| A| B| C| GemmSpec| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemm| BlkGemm| + //#########################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| | Size| Block| Block| Block| | | Wmma| Wmma| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| 
MRepeat| NRepeat| ClusterLengths| ScalarPerVector| PipeSched| PipelineVer| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _MBlock_MPerBlock| _NPerBlock| | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NBlock_NPerBlock| | | | + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 32, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 64, 64, 8, 8, 16, 16, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 64, 32, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 256, 64, 8, 8, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 
1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 160, 64, 8, 8, 16, 16, 2, 5, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 64, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 128, 32, 8, 8, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 256, 64, 64, 8, 8, 16, 16, 8, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 256, 64, 8, 8, 16, 16, 2, 8, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 80, 64, 8, 8, 16, 16, 1, 5, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 
2, 1>, 1, 1, 8, 1, 1, 1, S<1, 64, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 64, 64, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 64, 32, 64, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 32, 64, 64, 8, 8, 16, 16, 2, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 32, 64, 8, 8, 16, 16, 1, 2, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp new file mode 100644 index 0000000000..c9a730de68 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_wmma_universal_bf16_bf16_bf16_km_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn.hpp new file mode 100644 index 0000000000..6c3a641f9f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn.hpp @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = bhalf_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_instances = + std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle| A| B| C| GemmSpec| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemm| BlkGemm| + //#########################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| | Size| Block| Block| Block| | | Wmma| Wmma| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| 
MRepeat| NRepeat| ClusterLengths| ScalarPerVector| PipeSched| PipelineVer| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _MBlock_MPerBlock| _NPerBlock| | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NBlock_NPerBlock| | | | + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 32, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 64, 64, 8, 8, 16, 16, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 64, 32, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 256, 64, 8, 8, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 
1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 160, 64, 8, 8, 16, 16, 2, 5, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 128, 32, 8, 8, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 256, 64, 64, 8, 8, 16, 16, 8, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 256, 64, 8, 8, 16, 16, 2, 8, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 80, 64, 8, 8, 16, 16, 1, 5, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 
0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 64, 64, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 64, 32, 64, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 32, 64, 64, 8, 8, 16, 16, 2, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 32, 64, 8, 8, 16, 16, 1, 2, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp new file mode 100644 index 0000000000..cd88edec59 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_wmma_universal_bf16_bf16_bf16_km_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn.hpp new file mode 100644 index 0000000000..b700e78d3d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn.hpp @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = bhalf_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances = + std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle| A| B| C| GemmSpec| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemm| BlkGemm| + //#########################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| | Size| Block| Block| Block| | | Wmma| Wmma| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| 
MRepeat| NRepeat| ClusterLengths| ScalarPerVector| PipeSched| PipelineVer| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _MBlock_MPerBlock| _NPerBlock| | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NBlock_NPerBlock| | | | + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 32, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 64, 64, 8, 8, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 64, 32, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 256, 64, 8, 8, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 
2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 160, 64, 8, 8, 16, 16, 2, 5, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 64, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 128, 32, 8, 8, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 256, 64, 64, 8, 8, 16, 16, 8, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 256, 64, 8, 8, 16, 16, 2, 8, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 80, 64, 8, 8, 16, 16, 1, 5, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 
2, 1>, 1, 1, 8, 1, 1, 1, S<1, 64, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 64, 64, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 64, 32, 64, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 32, 64, 64, 8, 8, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 32, 64, 8, 8, 16, 16, 1, 2, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + // Configurations used during development, mainly for testing + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<0, 2, 
1>, S<0, 2, 1>, 1, 1, 4, 0, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 32, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp new file mode 100644 index 0000000000..9951c02251 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_wmma_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn.hpp new file mode 100644 index 0000000000..7b4cd64d33 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn.hpp @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = bhalf_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances = + std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle| A| B| C| GemmSpec| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemm| BlkGemm| + //#########################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| | Size| Block| Block| Block| | | Wmma| Wmma| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| 
MRepeat| NRepeat| ClusterLengths| ScalarPerVector| PipeSched| PipelineVer| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _MBlock_MPerBlock| _NPerBlock| | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NBlock_NPerBlock| | | | + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 32, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 64, 64, 8, 8, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 64, 32, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 256, 64, 8, 8, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 
2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 160, 64, 8, 8, 16, 16, 2, 5, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 128, 32, 8, 8, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 256, 64, 64, 8, 8, 16, 16, 8, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 256, 64, 8, 8, 16, 16, 2, 8, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 80, 64, 8, 8, 16, 16, 1, 5, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 
0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 64, 64, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 64, 32, 64, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 32, 64, 64, 8, 8, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 32, 64, 8, 8, 16, 16, 1, 2, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp new file mode 100644 index 0000000000..3a607c4178 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_bf16_bf16_bf16/device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, + device_gemm_wmma_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn.hpp new file mode 100644 index 0000000000..3751dc5a11 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn.hpp @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = half_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_instances = + std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle| A| B| C| GemmSpec| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemm| BlkGemm| + //#########################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| | Size| Block| Block| Block| | | Wmma| Wmma| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| 
MRepeat| NRepeat| ClusterLengths| ScalarPerVector| PipeSched| PipelineVer| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _MBlock_MPerBlock| _NPerBlock| | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NBlock_NPerBlock| | | | + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 32, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 64, 64, 8, 8, 16, 16, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 64, 32, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 256, 64, 8, 8, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 160, 64, 8, 8, 16, 16, 2, 5, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 64, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 128, 32, 8, 8, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 256, 64, 64, 8, 8, 16, 16, 8, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 256, 64, 8, 8, 16, 16, 2, 8, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 80, 64, 8, 8, 16, 16, 1, 5, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 64, 1, 2>, 8, 
Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 64, 64, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 64, 32, 64, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 32, 64, 64, 8, 8, 16, 16, 2, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 32, 64, 8, 8, 16, 16, 1, 2, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_default_instance.cpp 
b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_default_instance.cpp new file mode 100644 index 0000000000..3971802415 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_default_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "device_gemm_wmma_universal_f16_f16_f16_km_kn_mn.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn.hpp new file mode 100644 index 0000000000..222b49eb7d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn.hpp @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = half_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_instances = + std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle| A| B| C| GemmSpec| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemm| BlkGemm| + //#########################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| | Size| Block| Block| Block| | | Wmma| Wmma| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| 
MRepeat| NRepeat| ClusterLengths| ScalarPerVector| PipeSched| PipelineVer| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _MBlock_MPerBlock| _NPerBlock| | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NBlock_NPerBlock| | | | + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 32, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 64, 64, 8, 8, 16, 16, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 64, 32, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 256, 64, 8, 8, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 
1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 160, 64, 8, 8, 16, 16, 2, 5, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 128, 32, 8, 8, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 256, 64, 64, 8, 8, 16, 16, 8, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 256, 64, 8, 8, 16, 16, 2, 8, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 80, 64, 8, 8, 16, 16, 1, 5, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 2>, 8, 
Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 64, 64, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 64, 32, 64, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 32, 64, 64, 8, 8, 16, 16, 2, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 32, 64, 8, 8, 16, 16, 1, 2, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_default_instance.cpp 
b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_default_instance.cpp new file mode 100644 index 0000000000..36901b4f38 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_default_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "device_gemm_wmma_universal_f16_f16_f16_km_nk_mn.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp new file mode 100644 index 0000000000..6960375ed6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = half_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_instances = + std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle| A| B| C| GemmSpec| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemm| BlkGemm| + //#########################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| | Size| Block| Block| Block| | | Wmma| Wmma| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| 
MRepeat| NRepeat| ClusterLengths| ScalarPerVector| PipeSched| PipelineVer| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _MBlock_MPerBlock| _NPerBlock| | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NBlock_NPerBlock| | | | + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 32, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 64, 64, 8, 8, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 64, 32, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 256, 64, 8, 8, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 160, 64, 8, 8, 16, 16, 2, 5, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 64, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 128, 32, 8, 8, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 256, 64, 64, 8, 8, 16, 16, 8, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 256, 64, 8, 8, 16, 16, 2, 8, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 80, 64, 8, 8, 16, 16, 1, 5, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 64, 1, 2>, 8, 
Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 64, 64, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 64, 32, 64, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 32, 64, 64, 8, 8, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 32, 64, 8, 8, 16, 16, 1, 2, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + // Configurations used during development, mainly for testing + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 0, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, 
BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 32, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp new file mode 100644 index 0000000000..bbc8b92217 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn.hpp new file mode 100644 index 0000000000..7f71cf6f59 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn.hpp @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = half_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_instances = + std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle| A| B| C| GemmSpec| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemm| BlkGemm| + //#########################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| | Size| Block| Block| Block| | | Wmma| Wmma| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| 
MRepeat| NRepeat| ClusterLengths| ScalarPerVector| PipeSched| PipelineVer| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _MBlock_MPerBlock| _NPerBlock| | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NBlock_NPerBlock| | | | + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 32, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 64, 64, 8, 8, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 64, 32, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 256, 64, 8, 8, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 
1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 64, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 160, 64, 8, 8, 16, 16, 2, 5, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 128, 128, 32, 8, 8, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 256, 64, 64, 8, 8, 16, 16, 8, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 256, 64, 8, 8, 16, 16, 2, 8, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 80, 64, 8, 8, 16, 16, 1, 5, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 2>, 8, 
Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 16, 64, 64, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 64, 32, 64, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 64, 32, 64, 64, 8, 8, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 32, 64, 8, 8, 16, 16, 1, 2, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp 
b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp new file mode 100644 index 0000000000..331ca8b2ff --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_default_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn.hpp new file mode 100644 index 0000000000..2fca3551b4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn.hpp @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F8 = f8_t; +using BF16 = bhalf_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_instances = + std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle| A| B| C| GemmSpec| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemm| BlkGemm| Compute| Compute| + //#########################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| | Size| Block| Block| Block| | | Wmma| Wmma| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| 
SrcScalar| DstScalar| ExtraN| MRepeat| NRepeat| ClusterLengths| ScalarPerVector| PipeSched| PipelineVer| TypeA| TypeB| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _MBlock_MPerBlock| _NPerBlock| | | | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NBlock_NPerBlock| | | | | | + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 64, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 0, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3, F8, F8>, + DeviceGemm_Wmma_CShuffleV3< Row, Row, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 128, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 32, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3, F8, F8> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp new file mode 100644 index 0000000000..5087a9d719 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_default_instance.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, 
Advanced Micro Devices, Inc. All rights reserved. + +#include "device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_default_instances( + std::vector>>& + instances) +{ + if(ck::is_gfx11_supported()) + return; + + add_device_operation_instances( + instances, device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn.hpp new file mode 100644 index 0000000000..244eb69190 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn.hpp @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F8 = f8_t; +using BF16 = bhalf_t; +using F32 = float; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +template +using device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_instances = + std::tuple< + // clang-format off + //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle| A| B| C| GemmSpec| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemm| BlkGemm| Compute| Compute| + //#########################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| | Size| Block| Block| Block| | | Wmma| Wmma| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| 
SrcScalar| DstScalar| ExtraN| MRepeat| NRepeat| ClusterLengths| ScalarPerVector| PipeSched| PipelineVer| TypeA| TypeB| + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _MBlock_MPerBlock| _NPerBlock| | | | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NBlock_NPerBlock| | | | | | + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 32, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8, Intrawave, BlockGemmPipelineVersion::v3, F8, F8>, + DeviceGemm_Wmma_CShuffleV3< Row, Col, Row, F8, F8, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, 256, 128, 128, 32, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, Intrawave, BlockGemmPipelineVersion::v3, F8, F8> + // clang-format on + >; +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp new file mode 100644 index 0000000000..89df765517 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, 
Advanced Micro Devices, Inc. All rights reserved. + +#include "device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances( + std::vector>>& + instances) +{ + if(ck::is_gfx11_supported()) + return; + + add_device_operation_instances( + instances, device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/include/profiler/profile_gemm_universal_impl.hpp b/profiler/include/profiler/profile_gemm_universal_impl.hpp index 2054ffbbb3..f7b1d5f1f8 100644 --- a/profiler/include/profiler/profile_gemm_universal_impl.hpp +++ b/profiler/include/profiler/profile_gemm_universal_impl.hpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/tensor_operation_instance/gpu/gemm_universal.hpp" diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index 9cb70e4670..17c8c277eb 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -58,7 +58,6 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") list(APPEND PROFILER_SOURCES profile_gemm_add_multiply.cpp) list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp) list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp) list(APPEND PROFILER_SOURCES profile_gemm_b_scale.cpp) list(APPEND PROFILER_SOURCES profile_batched_gemm_b_scale.cpp) list(APPEND PROFILER_SOURCES profile_gemm_universal_batched.cpp) @@ -76,6 +75,7 @@ if(SUPPORTED_GPU_TARGETS 
MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12 if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp) endif() + list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp) list(APPEND PROFILER_SOURCES profile_grouped_conv_fwd.cpp) list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_data.cpp) list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_weight.cpp) @@ -144,7 +144,6 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance) endif() target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_b_scale_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_b_scale_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_batched_instance) @@ -170,6 +169,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx11" if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance) endif() + target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance) diff --git a/profiler/src/profile_gemm_universal.cpp b/profiler/src/profile_gemm_universal.cpp index a22d983da5..7f2393a7e6 100644 --- a/profiler/src/profile_gemm_universal.cpp +++ b/profiler/src/profile_gemm_universal.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved. #include #include @@ -103,8 +103,10 @@ int profile_gemm_universal(int argc, char* argv[]) using F32 = float; using F16 = ck::half_t; using BF16 = ck::bhalf_t; -#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) +#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) || defined(CK_USE_WMMA_FP8) using F8 = ck::f8_t; +#endif +#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) using I4 = ck::pk_i4_t; #endif @@ -201,7 +203,7 @@ int profile_gemm_universal(int argc, char* argv[]) { return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Col{}, Row{}, Row{}); } -#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) +#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) || defined(CK_USE_WMMA_FP8) else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_KN_MN) { return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Row{}, Row{}); @@ -210,6 +212,8 @@ int profile_gemm_universal(int argc, char* argv[]) { return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{}); } +#endif +#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94) else if(data_type == GemmDataType::F16_I4_F16 && layout == GemmMatrixLayout::MK_NK_MN) { return profile(F16{}, I4{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); diff --git a/test/gemm_universal/CMakeLists.txt b/test/gemm_universal/CMakeLists.txt index cf5c68e220..0a68622ebe 100755 --- a/test/gemm_universal/CMakeLists.txt +++ b/test/gemm_universal/CMakeLists.txt @@ -1,15 +1,29 @@ -add_gtest_executable(test_gemm_universal_fp16 test_gemm_universal_xdl_fp16.cpp) +add_gtest_executable(test_gemm_universal_wmma_fp16 test_gemm_universal_wmma_fp16.cpp) if(result EQUAL 0) - target_link_libraries(test_gemm_universal_fp16 PRIVATE utility device_gemm_universal_instance) - endif() - -add_gtest_executable(test_gemm_universal_fp8 test_gemm_universal_xdl_fp8.cpp) 
-if(result EQUAL 0) - target_link_libraries(test_gemm_universal_fp8 PRIVATE utility device_gemm_universal_instance) + target_link_libraries(test_gemm_universal_wmma_fp16 PRIVATE utility device_gemm_universal_instance) endif() -add_gtest_executable(test_gemm_universal_bf16 test_gemm_universal_xdl_bf16.cpp) +add_gtest_executable(test_gemm_universal_wmma_bf16 test_gemm_universal_wmma_bf16.cpp) if(result EQUAL 0) - target_link_libraries(test_gemm_universal_bf16 PRIVATE utility device_gemm_universal_instance) + target_link_libraries(test_gemm_universal_wmma_bf16 PRIVATE utility device_gemm_universal_instance) endif() +add_gtest_executable(test_gemm_universal_wmma_fp8 test_gemm_universal_wmma_fp8.cpp) +if(result EQUAL 0) + target_link_libraries(test_gemm_universal_wmma_fp8 PRIVATE utility device_gemm_universal_instance) +endif() + +add_gtest_executable(test_gemm_universal_xdl_fp16 test_gemm_universal_xdl_fp16.cpp) +if(result EQUAL 0) + target_link_libraries(test_gemm_universal_xdl_fp16 PRIVATE utility device_gemm_universal_instance) +endif() + +add_gtest_executable(test_gemm_universal_xdl_fp8 test_gemm_universal_xdl_fp8.cpp) +if(result EQUAL 0) + target_link_libraries(test_gemm_universal_xdl_fp8 PRIVATE utility device_gemm_universal_instance) +endif() + +add_gtest_executable(test_gemm_universal_xdl_bf16 test_gemm_universal_xdl_bf16.cpp) +if(result EQUAL 0) + target_link_libraries(test_gemm_universal_xdl_bf16 PRIVATE utility device_gemm_universal_instance) +endif() diff --git a/test/gemm_universal/test_gemm_universal_wmma_bf16.cpp b/test/gemm_universal/test_gemm_universal_wmma_bf16.cpp new file mode 100644 index 0000000000..22376a8599 --- /dev/null +++ b/test/gemm_universal/test_gemm_universal_wmma_bf16.cpp @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "gtest/gtest.h" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "test_gemm_universal_util.hpp" + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +namespace { + +template +struct tuple_concat; + +template +struct tuple_concat, std::tuple> +{ + using type = std::tuple; +}; + +} // namespace + +template +class TestGemmUniversal_BF16_MK_KN + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_BF16_MK_NK + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_BF16_KM_KN + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_BF16_KM_NK + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +// clang-format off +using KernelTypes_MK_KN = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + std::tuple< BF16, BF16, BF16, BF16> + >; + +using KernelTypes_MK_NK = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + std::tuple< BF16, BF16, BF16, BF16> + >; + +using KernelTypes_KM_KN = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + std::tuple< BF16, BF16, BF16, BF16> + >; + +using KernelTypes_KM_NK = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + std::tuple< BF16, BF16, BF16, BF16> + >; +// clang-format on + +TYPED_TEST_SUITE(TestGemmUniversal_BF16_MK_KN, KernelTypes_MK_KN); +TYPED_TEST_SUITE(TestGemmUniversal_BF16_MK_NK, KernelTypes_MK_NK); +TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_KN, KernelTypes_KM_KN); +TYPED_TEST_SUITE(TestGemmUniversal_BF16_KM_NK, KernelTypes_KM_NK); + +#include "test_gemm_universal_ut_cases_bf16.inc" diff --git a/test/gemm_universal/test_gemm_universal_wmma_fp16.cpp b/test/gemm_universal/test_gemm_universal_wmma_fp16.cpp new file mode 100644 index 0000000000..1adee41ed2 --- 
/dev/null +++ b/test/gemm_universal/test_gemm_universal_wmma_fp16.cpp @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "gtest/gtest.h" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "test_gemm_universal_util.hpp" + +using F16 = ck::half_t; + +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +namespace { + +template +struct tuple_concat; + +template +struct tuple_concat, std::tuple> +{ + using type = std::tuple; +}; + +} // namespace + +template +class TestGemmUniversal_FP16_MK_KN + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_FP16_MK_NK + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +// clang-format off +using KernelTypes_MK_KN = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + std::tuple< F16, F16, F16, F16> + >; + +using KernelTypes_MK_NK = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + std::tuple< F16, F16, F16, F16> + >; +// clang-format on + +TYPED_TEST_SUITE(TestGemmUniversal_FP16_MK_KN, KernelTypes_MK_KN); +TYPED_TEST_SUITE(TestGemmUniversal_FP16_MK_NK, KernelTypes_MK_NK); + +#include "test_gemm_universal_ut_cases_fp16.inc" diff --git a/test/gemm_universal/test_gemm_universal_wmma_fp8.cpp b/test/gemm_universal/test_gemm_universal_wmma_fp8.cpp new file mode 100644 index 0000000000..3579424496 --- /dev/null +++ b/test/gemm_universal/test_gemm_universal_wmma_fp8.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "gtest/gtest.h" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "test_gemm_universal_util.hpp" + +#if CK_USE_WMMA_FP8 + +using F8 = ck::f8_t; +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +namespace { + +template +struct tuple_concat; + +template +struct tuple_concat, std::tuple> +{ + using type = std::tuple; +}; + +} // namespace + +template +class TestGemmUniversal_FP8_MK_KN + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +template +class TestGemmUniversal_FP8_MK_NK + : public ck::test::TestGemmUniversal, Tuple>::type> +{ +}; + +// clang-format off +using KernelTypes_MK_KN = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + std::tuple< F8, F8, F8, BF16> + >; + +using KernelTypes_MK_NK = ::testing::Types< + // ADataType, BDataType, ComputeDataType, CDataType + std::tuple< F8, F8, F8, BF16> + >; +// clang-format on + +TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_KN, KernelTypes_MK_KN); +TYPED_TEST_SUITE(TestGemmUniversal_FP8_MK_NK, KernelTypes_MK_NK); + +#include "test_gemm_universal_ut_cases_fp8.inc" + +#endif // CK_USE_WMMA_FP8 From 83394e40d2452d32701bed4ed85bea1bfa50cfc2 Mon Sep 17 00:00:00 2001 From: lalala-sh Date: Tue, 29 Apr 2025 00:49:31 +0800 Subject: [PATCH 074/443] fix moe i4 example bug (#2139) --- example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp index 3c3ef16198..9e80a2ca35 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp @@ -233,7 +233,7 @@ int main(int argc, char* argv[]) ck::index_t StrideB = K; ck::index_t StrideE = N; constexpr ck::index_t NumDTensor = DsDataType::Size(); - constexpr 
auto StrideDs = std::array{0, 0, 0}; + constexpr auto StrideDs = std::array{1, 1, 1}; ck::index_t KBatch = 1; @@ -266,7 +266,8 @@ int main(int argc, char* argv[]) Tensor b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K})); Tensor b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K})); Tensor d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0})); - Tensor d1_e_n(HostTensorDescriptor({experts, N * 2}, {1, StrideDs[1]})); + Tensor d1_e_n( + HostTensorDescriptor({experts, N * 2}, {StrideDs[1] * N * 2, StrideDs[1]})); Tensor d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0})); Tensor e_t_n_host_result(HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1})); Tensor e_t_n_device_result( From 434d19f696da62c12b5372b32cbc9ba968588d7e Mon Sep 17 00:00:00 2001 From: jakpiase Date: Mon, 28 Apr 2025 18:53:19 +0200 Subject: [PATCH 075/443] Add ck tile examples to package (#1880) * add ck tile examples to package * Update jenkinsfile * fix for jenkinsfile * fix for building ck tile code on non gfx9 * compile ck tile examples only for gfx94 * include ck tile examples in all target * fix for basic gemm UseStructuredSparsity * Update CMakeLists.txt * Update gemm_pipeline_problem.hpp * add targets to rocm install --------- Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> --- example/CMakeLists.txt | 4 +++- example/ck_tile/01_fmha/CMakeLists.txt | 6 ++++-- example/ck_tile/02_layernorm2d/CMakeLists.txt | 3 ++- example/ck_tile/03_gemm/CMakeLists.txt | 7 +++++-- example/ck_tile/03_gemm/stript.sh | 1 + example/ck_tile/04_img2col/CMakeLists.txt | 3 ++- example/ck_tile/05_reduce/CMakeLists.txt | 4 +++- example/ck_tile/06_permute/CMakeLists.txt | 3 ++- .../ck_tile/09_topk_softmax/CMakeLists.txt | 5 +++-- example/ck_tile/10_rmsnorm2d/CMakeLists.txt | 6 ++++-- .../11_add_rmsnorm2d_rdquant/CMakeLists.txt | 6 ++++-- .../add_rmsnorm2d_rdquant_fwd.cpp | 21 +++++++++++-------- .../example_add_rmsnorm2d_rdquant_fwd.cpp | 21 
+++++++++++-------- example/ck_tile/12_smoothquant/CMakeLists.txt | 3 ++- example/ck_tile/13_moe_sorting/CMakeLists.txt | 3 ++- .../ck_tile/14_moe_smoothquant/CMakeLists.txt | 3 ++- example/ck_tile/15_fused_moe/CMakeLists.txt | 3 ++- .../ck_tile/16_batched_gemm/CMakeLists.txt | 3 ++- .../ck_tile/17_grouped_gemm/CMakeLists.txt | 4 ++-- example/ck_tile/18_flatmm/CMakeLists.txt | 4 +++- .../35_batched_transpose/CMakeLists.txt | 4 ++-- example/ck_tile/CMakeLists.txt | 5 ++++- .../flatmm_pipeline_agmem_bgmem_creg_v1.hpp | 1 + .../gemm/pipeline/gemm_pipeline_problem.hpp | 3 +-- .../ops/gemm/pipeline/tile_gemm_traits.hpp | 5 +++-- 25 files changed, 83 insertions(+), 48 deletions(-) create mode 100644 example/ck_tile/03_gemm/stript.sh mode change 100644 => 100755 example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 996a543ecc..0e61fd33ef 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -5,7 +5,6 @@ include_directories(BEFORE add_custom_target(examples) - # list of examples that are labelled as REGRESSION_EXAMPLE for make regression (runtime more than 30 seconds) # all other tests are labelled as SMOKE_EXAMPLE set(REGRESSION_EXAMPLES @@ -232,6 +231,9 @@ endfunction(add_example_executable_no_testing EXAMPLE_NAME) # add all example subdir file(GLOB dir_list LIST_DIRECTORIES true *) +if (NOT SUPPORTED_GPU_TARGETS MATCHES "gfx9") + list(FILTER dir_list EXCLUDE REGEX ".*/ck_tile") +endif() FOREACH(subdir ${dir_list}) if(IS_DIRECTORY "${subdir}" AND EXISTS "${subdir}/CMakeLists.txt") add_subdirectory(${subdir}) diff --git a/example/ck_tile/01_fmha/CMakeLists.txt b/example/ck_tile/01_fmha/CMakeLists.txt index 9ba3a453fc..ce3c8b3978 100644 --- a/example/ck_tile/01_fmha/CMakeLists.txt +++ b/example/ck_tile/01_fmha/CMakeLists.txt @@ -58,7 +58,8 @@ set(EXAMPLE_FMHA_FWD "tile_example_fmha_fwd") # not using add_example_executable() to add this target, since we don't want this 
to have # to be included in "make all/install/check" message("adding example ${EXAMPLE_FMHA_FWD}") -add_executable(${EXAMPLE_FMHA_FWD} EXCLUDE_FROM_ALL fmha_fwd.cpp) +add_executable(${EXAMPLE_FMHA_FWD} fmha_fwd.cpp) +rocm_install(TARGETS ${EXAMPLE_FMHA_FWD} COMPONENT examples) target_include_directories(${EXAMPLE_FMHA_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${EXAMPLE_FMHA_FWD} PRIVATE ${FMHA_FWD_GEN_BLOBS}) @@ -66,7 +67,8 @@ set(EXAMPLE_FMHA_BWD "tile_example_fmha_bwd") # not using add_example_executable() to add this target, since we don't want this to have # to be included in "make all/install/check" message("adding example ${EXAMPLE_FMHA_BWD}") -add_executable(${EXAMPLE_FMHA_BWD} EXCLUDE_FROM_ALL fmha_bwd.cpp) +add_executable(${EXAMPLE_FMHA_BWD} fmha_bwd.cpp) +rocm_install(TARGETS ${EXAMPLE_FMHA_BWD} COMPONENT examples) target_include_directories(${EXAMPLE_FMHA_BWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${EXAMPLE_FMHA_BWD} PRIVATE ${FMHA_BWD_GEN_BLOBS}) diff --git a/example/ck_tile/02_layernorm2d/CMakeLists.txt b/example/ck_tile/02_layernorm2d/CMakeLists.txt index fa69ac0f7a..74f195a9db 100644 --- a/example/ck_tile/02_layernorm2d/CMakeLists.txt +++ b/example/ck_tile/02_layernorm2d/CMakeLists.txt @@ -26,7 +26,8 @@ add_custom_command( set(EXAMPLE_LAYERNORM2D_FWD "tile_example_layernorm2d_fwd") message("adding example ${EXAMPLE_LAYERNORM2D_FWD}") -add_executable(${EXAMPLE_LAYERNORM2D_FWD} EXCLUDE_FROM_ALL layernorm2d_fwd.cpp) +add_executable(${EXAMPLE_LAYERNORM2D_FWD} layernorm2d_fwd.cpp) +rocm_install(TARGETS ${EXAMPLE_LAYERNORM2D_FWD} COMPONENT examples) target_include_directories(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_GEN_BLOBS}) diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt index 411db2e317..deccb71d23 100644 --- a/example/ck_tile/03_gemm/CMakeLists.txt +++ b/example/ck_tile/03_gemm/CMakeLists.txt 
@@ -1,5 +1,8 @@ -add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp) -add_executable(tile_example_gemm_universal EXCLUDE_FROM_ALL universal_gemm.cpp) +add_executable(tile_example_gemm_basic gemm_basic.cpp) +rocm_install(TARGETS tile_example_gemm_basic COMPONENT examples) +add_executable(tile_example_gemm_universal universal_gemm.cpp) +rocm_install(TARGETS tile_example_gemm_universal COMPONENT examples) + set(EXAMPLE_GEMM_COMPILE_OPTIONS) if(CK_USE_OCP_FP8) list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8) diff --git a/example/ck_tile/03_gemm/stript.sh b/example/ck_tile/03_gemm/stript.sh new file mode 100644 index 0000000000..4b91cb36ce --- /dev/null +++ b/example/ck_tile/03_gemm/stript.sh @@ -0,0 +1 @@ +for file in gemm_universal_*; do mv "$file" "${file/f16_f16_f16/fp16_fp16_fp16}"; done diff --git a/example/ck_tile/04_img2col/CMakeLists.txt b/example/ck_tile/04_img2col/CMakeLists.txt index 3864c9ed9d..d3737467d8 100644 --- a/example/ck_tile/04_img2col/CMakeLists.txt +++ b/example/ck_tile/04_img2col/CMakeLists.txt @@ -1,3 +1,4 @@ # not using add_example_executable() to add this target, since we don't want this to have # to be included in "make all/install/check" -add_executable(tile_example_img2col EXCLUDE_FROM_ALL image_to_column.cpp) +add_executable(tile_example_img2col image_to_column.cpp) +rocm_install(TARGETS tile_example_img2col COMPONENT examples) diff --git a/example/ck_tile/05_reduce/CMakeLists.txt b/example/ck_tile/05_reduce/CMakeLists.txt index 6caa38d50d..855e59c48e 100644 --- a/example/ck_tile/05_reduce/CMakeLists.txt +++ b/example/ck_tile/05_reduce/CMakeLists.txt @@ -3,7 +3,9 @@ set(EXAMPLE_REDUCE "tile_example_reduce") # to be included in "make all/install/check" message("adding example ${EXAMPLE_REDUCE}") -add_executable(${EXAMPLE_REDUCE} EXCLUDE_FROM_ALL reduce.cpp) +add_executable(${EXAMPLE_REDUCE} reduce.cpp) +rocm_install(TARGETS ${EXAMPLE_REDUCE} COMPONENT examples) + 
target_include_directories(${EXAMPLE_REDUCE} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) set(EXAMPLE_REDUCE_COMPILE_OPTIONS) diff --git a/example/ck_tile/06_permute/CMakeLists.txt b/example/ck_tile/06_permute/CMakeLists.txt index 327fceb685..22483a4295 100644 --- a/example/ck_tile/06_permute/CMakeLists.txt +++ b/example/ck_tile/06_permute/CMakeLists.txt @@ -1,6 +1,7 @@ # not using add_example_executable() to add this target, since we don't want this to have # to be included in "make all/install/check" -add_executable(tile_example_permute EXCLUDE_FROM_ALL permute.cpp) +add_executable(tile_example_permute permute.cpp) +rocm_install(TARGETS tile_example_permute COMPONENT examples) if(NOT DEFINED PERMUTE_USE_ALTERNATIVE_IMPL) # set(PERMUTE_USE_ALTERNATIVE_IMPL false) diff --git a/example/ck_tile/09_topk_softmax/CMakeLists.txt b/example/ck_tile/09_topk_softmax/CMakeLists.txt index b43b989792..fc2a4d3fe0 100644 --- a/example/ck_tile/09_topk_softmax/CMakeLists.txt +++ b/example/ck_tile/09_topk_softmax/CMakeLists.txt @@ -1,6 +1,7 @@ -add_executable(tile_example_topk_softmax EXCLUDE_FROM_ALL topk_softmax.cpp topk_softmax_api.cpp) -target_include_directories(tile_example_topk_softmax PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/) +add_executable(tile_example_topk_softmax topk_softmax.cpp topk_softmax_api.cpp) +rocm_install(TARGETS tile_example_topk_softmax COMPONENT examples) +target_include_directories(tile_example_topk_softmax PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/) set(EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS) # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations list(APPEND EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) diff --git a/example/ck_tile/10_rmsnorm2d/CMakeLists.txt b/example/ck_tile/10_rmsnorm2d/CMakeLists.txt index 5684c9b2e0..731ff639a4 100644 --- a/example/ck_tile/10_rmsnorm2d/CMakeLists.txt +++ b/example/ck_tile/10_rmsnorm2d/CMakeLists.txt @@ -26,7 +26,8 @@ add_custom_command( 
set(TILE_RMSNORM2D_FWD "tile_rmsnorm2d_fwd") message("adding ${TILE_RMSNORM2D_FWD}") -add_executable(${TILE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL rmsnorm2d_fwd.cpp) +add_executable(${TILE_RMSNORM2D_FWD} rmsnorm2d_fwd.cpp) +rocm_install(TARGETS ${TILE_RMSNORM2D_FWD} COMPONENT examples) target_include_directories(${TILE_RMSNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${TILE_RMSNORM2D_FWD} PRIVATE ${RMSNORM2D_FWD_GEN_BLOBS}) @@ -38,7 +39,8 @@ list(APPEND TILE_RMSNORM2D_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno target_compile_options(${TILE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS}) set(EXAMPLE_RMSNORM2D_FWD "tile_example_rmsnorm2d_fwd") -add_executable(${EXAMPLE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL example_rmsnorm2d_fwd.cpp) +add_executable(${EXAMPLE_RMSNORM2D_FWD} example_rmsnorm2d_fwd.cpp) +rocm_install(TARGETS ${EXAMPLE_RMSNORM2D_FWD} COMPONENT examples) target_compile_options(${EXAMPLE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS}) # TODO: we have to turn off this global prop, otherwise the progress bar generated diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt b/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt index 6b0c3cef7a..7071127e01 100644 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt @@ -3,7 +3,8 @@ set(TILE_ADD_RMSNORM2D_RDQUANT_FWD "tile_add_rmsnorm2d_rdquant_fwd") # to be included in "make all/install/check" message("adding ${TILE_ADD_RMSNORM2D_RDQUANT_FWD}") file(GLOB INSTANCE_SRCS instances/*.cpp) -add_executable(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL add_rmsnorm2d_rdquant_fwd.cpp) +add_executable(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} add_rmsnorm2d_rdquant_fwd.cpp) +rocm_install(TARGETS ${TILE_ADD_RMSNORM2D_RDQUANT_FWD} COMPONENT examples) target_include_directories(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE 
${INSTANCE_SRCS}) @@ -15,7 +16,8 @@ list(APPEND TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS -Wno-undefined-func-t target_compile_options(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS}) set(EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD "tile_example_add_rmsnorm2d_rdquant_fwd") -add_executable(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL example_add_rmsnorm2d_rdquant_fwd.cpp) +add_executable(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} example_add_rmsnorm2d_rdquant_fwd.cpp) +rocm_install(TARGETS ${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} COMPONENT examples) target_compile_options(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS}) # TODO: we have to turn off this global prop, otherwise the progress bar generated diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp index 574edf64d3..7d82a16aa9 100644 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp @@ -67,13 +67,14 @@ bool run(const ck_tile::ArgParser& arg_parser) using TypeConfig = AddRmsnormRdquantTypeConfig; - using ADataType = typename TypeConfig::ADataType; - using BDataType = typename TypeConfig::BDataType; - using GammaDataType = typename TypeConfig::GammaDataType; - using XDataType = typename TypeConfig::XDataType; - using YScaleDataType = typename TypeConfig::YScaleDataType; - using QYDataType = typename TypeConfig::QYDataType; - using ComputeDataType = float; + using ADataType = typename TypeConfig::ADataType; + using BDataType = typename TypeConfig::BDataType; + using GammaDataType = typename TypeConfig::GammaDataType; + using XDataType = typename TypeConfig::XDataType; + using UnquantYDataType = ck_tile::null_type; + using YScaleDataType = typename TypeConfig::YScaleDataType; + using QYDataType = typename 
TypeConfig::QYDataType; + using ComputeDataType = float; // host verify ck_tile::HostTensor a_host({m, n}, {stride, 1}); @@ -88,6 +89,7 @@ bool run(const ck_tile::ArgParser& arg_parser) ck_tile::HostTensor qy_host_ref({m, n}, {stride, 1}); ck_tile::HostTensor qy_host_dev({m, n}, {stride, 1}); + ck_tile::HostTensor unquant_y_host_ref({m, n}, {stride, 1}); ck_tile::FillUniformDistribution{-.5f, .5f}(a_host); ck_tile::FillUniformDistribution{-.5f, .5f}(b_host); @@ -191,8 +193,9 @@ bool run(const ck_tile::ArgParser& arg_parser) GammaDataType, ComputeDataType, YDataType, - InvRmsDataType>( - x_host_ref, gamma_host, y_host, invRms_host_ref, epsilon); + InvRmsDataType, + UnquantYDataType>( + x_host_ref, gamma_host, y_host, invRms_host_ref, unquant_y_host_ref, epsilon); } // yscale diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp old mode 100644 new mode 100755 index ada4c6f2da..3aab357909 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp @@ -62,13 +62,14 @@ bool run(const ck_tile::ArgParser& arg_parser) assert(stride >= n); - using ADataType = DataType; - using BDataType = DataType; - using GammaDataType = DataType; - using XDataType = DataType; - using YScaleDataType = float; - using QYDataType = ck_tile::int8_t; - using ComputeDataType = float; + using ADataType = DataType; + using BDataType = DataType; + using GammaDataType = DataType; + using XDataType = DataType; + using UnquantYDataType = ck_tile::null_type; + using YScaleDataType = float; + using QYDataType = ck_tile::int8_t; + using ComputeDataType = float; // host verify ck_tile::HostTensor a_host({m, n}, {stride, 1}); @@ -81,6 +82,7 @@ bool run(const ck_tile::ArgParser& arg_parser) ck_tile::HostTensor yscale_host_dev({m}, {1}); ck_tile::HostTensor qy_host_ref({m, n}, 
{stride, 1}); ck_tile::HostTensor qy_host_dev({m, n}, {stride, 1}); + ck_tile::HostTensor unquant_y_host_ref({m, n}, {stride, 1}); ck_tile::FillUniformDistribution{-.5f, .5f}(a_host); ck_tile::FillUniformDistribution{-.5f, .5f}(b_host); @@ -193,8 +195,9 @@ bool run(const ck_tile::ArgParser& arg_parser) GammaDataType, ComputeDataType, YDataType, - InvRmsDataType>( - x_host_ref, gamma_host, y_host, invRms_host_ref, epsilon); + InvRmsDataType, + UnquantYDataType>( + x_host_ref, gamma_host, y_host, invRms_host_ref, unquant_y_host_ref, epsilon); } // yscale diff --git a/example/ck_tile/12_smoothquant/CMakeLists.txt b/example/ck_tile/12_smoothquant/CMakeLists.txt index 3849833aca..daeeb827bd 100644 --- a/example/ck_tile/12_smoothquant/CMakeLists.txt +++ b/example/ck_tile/12_smoothquant/CMakeLists.txt @@ -2,7 +2,8 @@ function (add_smoothquant_example TARGET_NAME MAIN_SRC) message("adding ${TARGET_NAME}") # not using add_example_executable() to add target, since we don't want this to have # to be included in "make all/install/check" - add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC}) + add_executable(${TARGET_NAME} ${MAIN_SRC}) + rocm_install(TARGETS ${TARGET_NAME} COMPONENT examples) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) foreach(source IN LISTS ARGN) diff --git a/example/ck_tile/13_moe_sorting/CMakeLists.txt b/example/ck_tile/13_moe_sorting/CMakeLists.txt index 09f3e4ac4e..662e16f0d3 100644 --- a/example/ck_tile/13_moe_sorting/CMakeLists.txt +++ b/example/ck_tile/13_moe_sorting/CMakeLists.txt @@ -1,4 +1,5 @@ -add_executable(tile_example_moe_sorting EXCLUDE_FROM_ALL moe_sorting.cpp moe_sorting_api.cpp) +add_executable(tile_example_moe_sorting moe_sorting.cpp moe_sorting_api.cpp) +rocm_install(TARGETS tile_example_moe_sorting COMPONENT examples) target_include_directories(tile_example_moe_sorting PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/) set(EXAMPLE_MOE_SORTING_COMPILE_OPTIONS) diff --git 
a/example/ck_tile/14_moe_smoothquant/CMakeLists.txt b/example/ck_tile/14_moe_smoothquant/CMakeLists.txt index 12224a39a2..9acb27552a 100644 --- a/example/ck_tile/14_moe_smoothquant/CMakeLists.txt +++ b/example/ck_tile/14_moe_smoothquant/CMakeLists.txt @@ -2,7 +2,8 @@ function (add_moe_smoothquant_example TARGET_NAME MAIN_SRC) message("adding ${TARGET_NAME}") # not using add_example_executable() to add target, since we don't want this to have # to be included in "make all/install/check" - add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC}) + add_executable(${TARGET_NAME} ${MAIN_SRC}) + rocm_install(TARGETS ${TARGET_NAME} COMPONENT examples) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) foreach(source IN LISTS ARGN) diff --git a/example/ck_tile/15_fused_moe/CMakeLists.txt b/example/ck_tile/15_fused_moe/CMakeLists.txt index a716eef19e..bb25a55c7d 100644 --- a/example/ck_tile/15_fused_moe/CMakeLists.txt +++ b/example/ck_tile/15_fused_moe/CMakeLists.txt @@ -3,7 +3,8 @@ set(TILE_EXAPMLE_FUSED_MOE "tile_example_fused_moe") # to be included in "make all/install/check" message("adding ${TILE_EXAPMLE_FUSED_MOE}") file(GLOB INSTANCE_SRCS instances/*.cpp) -add_executable(${TILE_EXAPMLE_FUSED_MOE} EXCLUDE_FROM_ALL main.cpp) +add_executable(${TILE_EXAPMLE_FUSED_MOE} main.cpp) +rocm_install(TARGETS ${TILE_EXAPMLE_FUSED_MOE} COMPONENT examples) target_include_directories(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${INSTANCE_SRCS}) diff --git a/example/ck_tile/16_batched_gemm/CMakeLists.txt b/example/ck_tile/16_batched_gemm/CMakeLists.txt index 78e78c6b04..9eb7a45d80 100644 --- a/example/ck_tile/16_batched_gemm/CMakeLists.txt +++ b/example/ck_tile/16_batched_gemm/CMakeLists.txt @@ -1 +1,2 @@ -add_executable(tile_example_batched_gemm EXCLUDE_FROM_ALL batched_gemm.cpp) +add_executable(tile_example_batched_gemm batched_gemm.cpp) +rocm_install(TARGETS tile_example_batched_gemm 
COMPONENT examples) diff --git a/example/ck_tile/17_grouped_gemm/CMakeLists.txt b/example/ck_tile/17_grouped_gemm/CMakeLists.txt index d34013dd6c..80d688125b 100644 --- a/example/ck_tile/17_grouped_gemm/CMakeLists.txt +++ b/example/ck_tile/17_grouped_gemm/CMakeLists.txt @@ -1,2 +1,2 @@ -add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp) - +add_executable(tile_example_grouped_gemm grouped_gemm.cpp) +rocm_install(TARGETS tile_example_grouped_gemm COMPONENT examples) diff --git a/example/ck_tile/18_flatmm/CMakeLists.txt b/example/ck_tile/18_flatmm/CMakeLists.txt index 9fbe65e3a7..3a70f0447d 100644 --- a/example/ck_tile/18_flatmm/CMakeLists.txt +++ b/example/ck_tile/18_flatmm/CMakeLists.txt @@ -1,4 +1,6 @@ -add_executable(tile_example_flatmm_basic EXCLUDE_FROM_ALL flatmm_basic.cpp) +add_executable(tile_example_flatmm_basic flatmm_basic.cpp) +rocm_install(TARGETS tile_example_flatmm_basic COMPONENT examples) + set(EXAMPLE_FLATMM_COMPILE_OPTIONS) # list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) diff --git a/example/ck_tile/35_batched_transpose/CMakeLists.txt b/example/ck_tile/35_batched_transpose/CMakeLists.txt index a08fcebb74..10101e4d2e 100644 --- a/example/ck_tile/35_batched_transpose/CMakeLists.txt +++ b/example/ck_tile/35_batched_transpose/CMakeLists.txt @@ -1,9 +1,9 @@ set(TARGET_NAME tile_example_batched_transpose) -add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL batched_transpose_example.cpp batched_transpose_api.cpp) +add_executable(${TARGET_NAME} batched_transpose_example.cpp batched_transpose_api.cpp) +rocm_install(TARGETS ${TARGET_NAME} COMPONENT examples) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/) # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) # list(APPEND 
EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker) target_compile_options(tile_example_batched_transpose PRIVATE ${EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS}) - diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index 88efe0d8d9..16f68c6255 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -14,8 +14,11 @@ add_subdirectory(11_add_rmsnorm2d_rdquant) add_subdirectory(12_smoothquant) add_subdirectory(13_moe_sorting) add_subdirectory(14_moe_smoothquant) -add_subdirectory(15_fused_moe) add_subdirectory(16_batched_gemm) add_subdirectory(17_grouped_gemm) add_subdirectory(18_flatmm) add_subdirectory(35_batched_transpose) + +if (SUPPORTED_GPU_TARGETS MATCHES "gfx94") + add_subdirectory(15_fused_moe) +endif() diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp index 611aff318f..ad6641bc13 100644 --- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp @@ -6,6 +6,7 @@ #include "ck_tile/core.hpp" #include "ck_tile/host/concat.hpp" #include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp" +#include "ck_tile/host/concat.hpp" namespace ck_tile { diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp index 0b38e7789e..893c9d1ad3 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp @@ -30,8 +30,7 @@ struct GemmPipelineProblemBase using BLayout = remove_cvref_t; using CLayout = remove_cvref_t; - static constexpr bool TransposeC = Traits::TransposeC; - + static constexpr bool TransposeC = Traits::TransposeC; static constexpr bool UseStructuredSparsity = Traits::UseStructuredSparsity; 
static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size(); diff --git a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp index a31004b425..ecf861e4e8 100644 --- a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp +++ b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp @@ -12,7 +12,8 @@ template + typename CLayout_, + bool UseStructuredSparsity_ = false> struct TileGemmTraits { static constexpr bool kPadM = kPadM_; @@ -27,7 +28,7 @@ struct TileGemmTraits using CLayout = CLayout_; static constexpr bool TransposeC = false; - static constexpr bool UseStructuredSparsity = false; + static constexpr bool UseStructuredSparsity = UseStructuredSparsity_; }; template Date: Mon, 28 Apr 2025 16:40:22 -0400 Subject: [PATCH 076/443] Check max-ilp-scheduling compiler option for moe_gemm examples (#2127) --- example/65_gemm_multiply_multiply/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt index 3c1947c058..5d2a097576 100644 --- a/example/65_gemm_multiply_multiply/CMakeLists.txt +++ b/example/65_gemm_multiply_multiply/CMakeLists.txt @@ -15,7 +15,10 @@ foreach(gpu IN LISTS GPU_TARGETS) add_example_executable(example_moe_gemm2_xdl_pk_i4 moe_gemm2_xdl_pk_i4.cpp) if(CK_hip_VERSION VERSION_LESS_EQUAL 6.3.42132) set(EXAMPLE_COMPILE_OPTIONS) - list(APPEND EXAMPLE_COMPILE_OPTIONS -mllvm --amdgpu-enable-max-ilp-scheduling-strategy=1) + check_cxx_compiler_flag("-mllvm --amdgpu-enable-max-ilp-scheduling-strategy=1" HAS_MAX_ILP_SCHEDULING_STRATEGY) + if(HAS_MAX_ILP_SCHEDULING_STRATEGY) + list(APPEND EXAMPLE_COMPILE_OPTIONS -mllvm --amdgpu-enable-max-ilp-scheduling-strategy=1) + endif() target_compile_options(example_moe_gemm1_xdl_pk_i4 PRIVATE ${EXAMPLE_COMPILE_OPTIONS}) target_compile_options(example_moe_gemm2_xdl_pk_i4 PRIVATE ${EXAMPLE_COMPILE_OPTIONS}) endif() 
From 4094ad158a81a6c4fa0681e6d1481fb18c0d2257 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Mon, 28 Apr 2025 23:54:49 +0200 Subject: [PATCH 077/443] Integrate universal gemm with conv bwd data and add SplitK (#1315) * Integrate universal gemm with conv bwd data * Fix multi d kernel * Add splitK support * instances refactor * instances refactor * refactor * fixeS * fixes * 16x16 instnaces * Fixes * Fix * Fix * Fix * Fix * Fix * Fixes * fix * fix --- CHANGELOG.md | 1 + Jenkinsfile | 4 +- ...evice_grouped_conv_bwd_data_multiple_d.hpp | 5 +- ...conv_bwd_data_multiple_d_wmma_cshuffle.hpp | 27 +- ...nv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp | 1110 +++++++++++++++-- ...conv_bwd_weight_two_stage_xdl_cshuffle.hpp | 2 +- ...rouped_conv_bwd_weight_xdl_cshuffle_v3.hpp | 2 +- ...=> gridwise_gemm_xdl_cshuffle_conv_v3.hpp} | 2 +- .../transform_conv_bwd_data_to_gemm_v1.hpp | 68 +- ...ice_grouped_conv_bwd_data_xdl_instance.hpp | 75 +- .../gpu/grouped_convolution_backward_data.hpp | 24 + .../grouped_convolution_backward_data_xdl.inc | 168 +++ .../grouped_conv2d_bwd_data/CMakeLists.txt | 6 + ...ta_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp | 28 +- ...ata_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 28 +- ...ata_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 28 +- ..._ngchw_gkcyx_ngkhw_bf16_16_16_instance.cpp | 40 + ...ta_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp | 2 +- ...kcyx_ngkhw_bf16_vec_transpose_instance.cpp | 2 +- ...l_ngchw_gkcyx_ngkhw_f16_16_16_instance.cpp | 40 + ...ata_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp | 2 +- ...gkcyx_ngkhw_f16_vec_transpose_instance.cpp | 2 +- ...l_ngchw_gkcyx_ngkhw_f32_16_16_instance.cpp | 40 + ...ata_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp | 2 +- ...gkcyx_ngkhw_f32_vec_transpose_instance.cpp | 2 +- ...ta_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp | 2 +- ...ata_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp | 2 +- ...ata_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp | 2 +- ..._nhwgc_gkyxc_nhwgk_bf16_16_16_instance.cpp | 49 + 
...ta_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 2 +- ...l_nhwgc_gkyxc_nhwgk_f16_16_16_instance.cpp | 49 + ...ata_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 2 +- ...l_nhwgc_gkyxc_nhwgk_f32_16_16_instance.cpp | 49 + ...ata_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp | 2 +- .../grouped_conv3d_bwd_data/CMakeLists.txt | 7 + ...xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp | 28 +- ..._xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp | 28 +- ..._xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp | 28 +- ...hwgc_gkzyxc_ndhwgk_bf16_16_16_instance.cpp | 49 + ...xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp | 5 +- ...dhwgc_gkzyxc_ndhwgk_f16_16_16_instance.cpp | 49 + ..._xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp | 5 +- ...dhwgc_gkzyxc_ndhwgk_f32_16_16_instance.cpp | 49 + ..._xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp | 5 +- ..._ndhwgk_input_f16_comp_bf8_f8_instance.cpp | 5 +- ...cdhw_gkczyx_ngkdhw_bf16_16_16_instance.cpp | 40 + ...xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp | 3 +- ...zyx_ngkdhw_bf16_vec_transpose_instance.cpp | 3 +- ...gcdhw_gkczyx_ngkdhw_f16_16_16_instance.cpp | 40 + ..._xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp | 3 +- ...czyx_ngkdhw_f16_vec_transpose_instance.cpp | 3 +- ...gcdhw_gkczyx_ngkdhw_f32_16_16_instance.cpp | 40 + ..._xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp | 3 +- ...czyx_ngkdhw_f32_vec_transpose_instance.cpp | 3 +- ...xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp | 3 +- ..._xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp | 3 +- ..._xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp | 3 +- ...ear_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp | 5 +- ...near_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp | 5 +- ...near_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp | 5 +- ...ale_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp | 5 +- ...cale_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp | 5 +- ...cale_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp | 5 +- .../profile_grouped_conv_bwd_data_impl.hpp | 101 +- .../src/profile_grouped_conv_bwd_data.cpp | 8 +- script/convert_miopen_driver_to_profiler.py | 3 + test/grouped_convnd_bwd_data/CMakeLists.txt | 5 
+ .../test_grouped_convnd_bwd_data_xdl.cpp | 70 +- ...rouped_convnd_bwd_data_xdl_large_cases.cpp | 120 ++ 69 files changed, 2262 insertions(+), 349 deletions(-) rename include/ck/tensor_operation/gpu/grid/{gridwise_gemm_xdl_cshuffle_bwd_weight_v3.hpp => gridwise_gemm_xdl_cshuffle_conv_v3.hpp} (99%) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_16_16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_16_16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_16_16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_16_16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_16_16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_16_16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16_16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16_16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16_16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_16_16_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_16_16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_16_16_instance.cpp create mode 100644 test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl_large_cases.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index b9012c0a77..e0ec214c69 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj * Added support for Stream-K version of mixed fp8/bf16 GEMM * Added GEMM pipeline for microscaling (MX) data types * Added support for FP16 2:4 structured sparsity to universal GEMM. +* Added support for Split K for grouped convolution backward data. ### Optimized diff --git a/Jenkinsfile b/Jenkinsfile index f8043ba918..a18374509e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -937,8 +937,8 @@ pipeline { environment{ setup_args = "NO_CK_BUILD" execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \ - make -j64 test_grouped_convnd_fwd_large_cases_xdl && \ - ./bin/test_grouped_convnd_fwd_large_cases_xdl""" + make -j64 test_grouped_convnd_fwd_large_cases_xdl test_grouped_convnd_bwd_data_xdl_large_cases && \ + ./bin/test_grouped_convnd_fwd_large_cases_xdl && ./bin/test_grouped_convnd_bwd_data_xdl_large_cases""" } steps{ buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp index 2abf1d5a10..9c44bda5ca 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp @@ -1,5 +1,5 @@ // 
SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -59,7 +59,8 @@ struct DeviceGroupedConvBwdDataMultipleD : public BaseOperator const std::array& input_right_pads, const AElementwiseOperation& a_element_op, const BElementwiseOperation& b_element_op, - const CDEElementwiseOperation& cde_element_op) = 0; + const CDEElementwiseOperation& cde_element_op, + const ck::index_t split_k = 1) = 0; virtual std::unique_ptr MakeInvokerPointer() = 0; }; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp index 359711e5c4..5e41c96dfc 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -227,7 +227,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle const std::array& input_right_pads, const AElementwiseOp& a_element_op, const BElementwiseOp& b_element_op, - const CDEElementwiseOp& cde_element_op) + const CDEElementwiseOp& cde_element_op, + const ck::index_t split_k = 1) : p_a_grid_{static_cast(p_a)}, p_b_grid_{static_cast(p_b)}, p_ds_grid_{}, @@ -240,7 +241,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths}, conv_filter_strides_{conv_filter_strides}, input_left_pads_{input_left_pads}, - input_right_pads_{input_right_pads} + input_right_pads_{input_right_pads}, + k_batch_{split_k} { // populate Ds pointer static_for<0, NumDTensor, 1>{}([&](auto i) { @@ -445,6 +447,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle std::array conv_filter_strides_; std::array input_left_pads_; std::array input_right_pads_; + + const index_t k_batch_; }; // Invoker @@ -534,6 +538,11 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle static bool IsSupportedArgument(const Argument& arg) { + if(arg.k_batch_ != 1) + { + return false; + } + // check device if(ck::is_gfx11_supported() || ck::is_gfx12_supported()) { @@ -691,7 +700,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle const std::array& input_right_pads, const AElementwiseOp& a_element_op, const BElementwiseOp& b_element_op, - const CDEElementwiseOp& cde_element_op) + const CDEElementwiseOp& cde_element_op, + const ck::index_t split_k = 1) { return Argument{p_a, p_b, @@ -711,7 +721,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle input_right_pads, a_element_op, b_element_op, - cde_element_op}; + cde_element_op, + split_k}; } static auto MakeInvoker() { return Invoker{}; } @@ -737,7 +748,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle const std::array& input_right_pads, const AElementwiseOp& a_element_op, const BElementwiseOp& b_element_op, - const CDEElementwiseOp& cde_element_op) override + 
const CDEElementwiseOp& cde_element_op, + const ck::index_t split_k = 1) override { return std::make_unique(p_a, p_b, @@ -757,7 +769,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle input_right_pads, a_element_op, b_element_op, - cde_element_op); + cde_element_op, + split_k); } std::unique_ptr MakeInvokerPointer() override diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp index 08edddf107..3028cd7cbc 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp @@ -15,12 +15,15 @@ #include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp" #include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp" #include "ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/operator_transform/transform_conv_ngchw_to_nhwgc.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" +#include "ck/host_utility/flush_cache.hpp" #include "ck/host_utility/io.hpp" namespace ck { @@ -151,6 +154,153 @@ __global__ void #endif } +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy) +#endif + // __attribute__((amdgpu_waves_per_eu(1, 1))) + 
kernel_grouped_conv_bwd_data_xdl_cshuffle_v3( + typename GridwiseGemm::Argument karg, + const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1, + const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, + const ComputePtrOffsetOfN compute_ptr_offset_of_n, + const index_t num_k_per_block) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \ + defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) + // offset base pointer for each work-group + const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z); + const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / karg.KBatch); + const index_t k_idx = + __builtin_amdgcn_readfirstlane((blockIdx.y - n_idx * karg.KBatch) * num_k_per_block); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); + + const long_index_t a_n_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx)); + const long_index_t e_n_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx)); + + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run(karg.p_a_grid + a_batch_offset + a_n_offset, + karg.p_b_grid + b_batch_offset, + karg.p_c_grid + e_batch_offset + e_n_offset, + p_shared, + karg, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + c_grid_desc_mblock_mperblock_nblock_nperblock, + k_idx); +#else + ignore = karg; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = 
c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = compute_ptr_offset_of_batch; + ignore = compute_ptr_offset_of_n; + ignore = num_k_per_block; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy) +#endif + // __attribute__((amdgpu_waves_per_eu(1, 1))) + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3_2lds( + typename GridwiseGemm::Argument karg, + const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1, + const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, + const ComputePtrOffsetOfN compute_ptr_offset_of_n, + const index_t num_k_per_block) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \ + defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) + const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z); + const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / karg.KBatch); + const index_t k_idx = + __builtin_amdgcn_readfirstlane((blockIdx.y - n_idx * karg.KBatch) * num_k_per_block); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); + + const long_index_t a_n_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx)); + const long_index_t e_n_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx)); + + // Pass two lds pointer is the key to tell compiler that ds_read/write + // operate on different lds chunk at 
same time without order dependecy + __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run_2Lds(karg.p_a_grid + a_batch_offset + a_n_offset, + karg.p_b_grid + b_batch_offset, + karg.p_c_grid + e_batch_offset + e_n_offset, + p_shared_0, + p_shared_1, + karg, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + c_grid_desc_mblock_mperblock_nblock_nperblock, + k_idx); +#else + ignore = karg; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = compute_ptr_offset_of_batch; + ignore = compute_ptr_offset_of_n; + ignore = num_k_per_block; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + } // namespace // Conv backward data multiple D: @@ -210,7 +360,9 @@ template + index_t MaxTransposeTransferOutScalarPerVector = 1, + BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave, + BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1> struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 : public DeviceGroupedConvBwdDataMultipleD 0; + static constexpr GemmSpecialization GemmSpec = GemmSpecialization::MNKPadding; + static constexpr bool IsSplitKSupported = + (CDEBlockTransferScalarPerVector_NPerBlock % 2 == 0 || sizeof(EDataType) % 4 == 0) && + std::is_same_v, element_wise::PassThrough>; // TODO: Add support for different A and B data types. 
using ABDataType = ADataType; @@ -315,53 +472,63 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, ds_grid_desc_m_n, e_grid_desc_m_n); } - // GridwiseGemm - using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle< - ABDataType, - ABDataType, - AComputeType, - AccDataType, - CShuffleDataType, - DsDataType, - EDataType, - AElementwiseOp, - BElementwiseOp, - CDEElementwiseOp, - InMemoryDataOperationEnum::Set, - NumGemmKPrefetchStage, - BlockSize, - MPerBlock, - NPerBlock, - KPerBlock, - AK1, - BK1, - MPerXDL, - NPerXDL, - MXdlPerWave, - NXdlPerWave, - ABlockTransferThreadClusterLengths_AK0_M_AK1, - ABlockTransferThreadClusterArrangeOrder, - ABlockTransferSrcAccessOrder, - ABlockTransferSrcVectorDim, - ABlockTransferSrcScalarPerVector, - ABlockTransferDstScalarPerVector_AK1, - false, - ABlockLdsExtraM, - BBlockTransferThreadClusterLengths_BK0_N_BK1, - BBlockTransferThreadClusterArrangeOrder, - BBlockTransferSrcAccessOrder, - BBlockTransferSrcVectorDim, - BBlockTransferSrcScalarPerVector, - BBlockTransferDstScalarPerVector_BK1, - false, - BBlockLdsExtraN, - CShuffleMXdlPerWavePerShuffle, - CShuffleNXdlPerWavePerShuffle, - CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, - CDEBlockTransferScalarPerVector_NPerBlock, - LoopSched, - PipelineVersion::v1, - BComputeType>; +// GridwiseGemm +#define GridwiseGemmMultiDTemplateParams \ + ABDataType, ABDataType, AComputeType, AccDataType, CShuffleDataType, DsDataType, EDataType, \ + AElementwiseOp, BElementwiseOp, CDEElementwiseOp, InMemoryDataOperationEnum::Set, \ + NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, \ + NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, \ + ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, \ + ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, \ + ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, \ + 
BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, \ + BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, \ + BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, \ + BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \ + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \ + CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVersion::v1, BComputeType + +#define GridwiseGemmTemplateParams \ + tensor_layout::gemm::RowMajor, tensor_layout::gemm::RowMajor, tensor_layout::gemm::RowMajor, \ + ADataType, BDataType, AccDataType, CShuffleDataType, EDataType, AElementwiseOp, \ + BElementwiseOp, CDEElementwiseOp, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, \ + AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, \ + ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, \ + ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, \ + ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, \ + ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, \ + BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, \ + BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, \ + BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, \ + CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \ + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \ + CDEBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, \ + AComputeType, BComputeType + + using GridwiseGemm = + std::conditional_t, + GridwiseGemm_xdl_cshuffle_v3>; + + template + static auto + MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const EGridDesc_M_N e_grid_desc_m_n) + { + if constexpr(isMultiD) + { + return GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n); + } + else + { + const index_t M = 
e_grid_desc_m_n.GetLength(I0); + const index_t N = e_grid_desc_m_n.GetLength(I1); + return GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n, + GridwiseGemm::CalculateMBlock(M), + GridwiseGemm::CalculateNBlock(N)); + } + } template static auto transform_k0_m_k1_to_m_k(const Desc_K0_M_K1& desc_k0_m_k1) @@ -390,15 +557,15 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 using BGridDesc_N_K = decltype(transform_k0_m_k1_to_m_k(BGridDesc_BK0_N_BK1{})); using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = - decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - DsGridDesc_M_N{})); + decltype(GridwiseGemmMultipleD_xdl_cshuffle:: + MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{})); using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = - decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - EGridDesc_M_N{})); + decltype(MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{})); // block-to-e-tile map - using Block2ETileMap = - remove_cvref_t; + using Block2ETileMap = remove_cvref_t< + decltype(GridwiseGemmMultipleD_xdl_cshuffle< + GridwiseGemmMultiDTemplateParams>::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>; using Block2TileMapInOutElementwise = BlockToCTileMap_M00_N0_M01Adapt; using Block2TileMapWeiElementwise = BlockToCTileMap_M00_N0_M01Adapt; @@ -511,7 +678,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 const std::array& input_right_pads, const AElementwiseOp& a_element_op, const BElementwiseOp& b_element_op, - const CDEElementwiseOp& cde_element_op) + const CDEElementwiseOp& cde_element_op, + ck::index_t split_k = 1) : p_a_grid_{static_cast(p_a)}, p_b_grid_{static_cast(p_b)}, p_ds_grid_{}, @@ -525,7 +693,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 e_g_n_c_wis_lengths_{e_g_n_c_wis_lengths}, conv_filter_strides_{conv_filter_strides}, input_left_pads_{input_left_pads}, - 
input_right_pads_{input_right_pads} + input_right_pads_{input_right_pads}, + k_batch_{split_k} { std::array a_g_n_k_wos_strides_transposed = conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(a_g_n_k_wos_lengths, @@ -626,7 +795,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 conv_filter_dilations, input_left_pads, input_right_pads, - tildes}; + tildes, + k_batch_}; conv_N_per_block_ = conv_to_gemm_transform_.N_; @@ -682,34 +852,48 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 const auto b_grid_desc_n_k = transform_k0_m_k1_to_m_k(b_grid_desc_bk0_n_bk1); - a_grid_desc_m_k_container_.push_back(a_grid_desc_m_k); - b_grid_desc_n_k_container_.push_back(b_grid_desc_n_k); - ds_grid_desc_m_n_container_.push_back(ds_grid_desc_m_n); - e_grid_desc_m_n_container_.push_back(e_grid_desc_m_n); + if constexpr(isMultiD) + { + a_grid_desc_m_k_container_.push_back(a_grid_desc_m_k); + b_grid_desc_n_k_container_.push_back(b_grid_desc_n_k); + ds_grid_desc_m_n_container_.push_back(ds_grid_desc_m_n); + e_grid_desc_m_n_container_.push_back(e_grid_desc_m_n); + } // desc for blockwise copy a_grid_desc_ak0_m_ak1_container_.push_back(a_grid_desc_ak0_m_ak1); b_grid_desc_bk0_n_bk1_container_.push_back(b_grid_desc_bk0_n_bk1); - // block-to-e-tile-map - auto block_2_etile_map = - GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n); - - block_2_etile_map_container_.push_back(block_2_etile_map); - - if(GridwiseGemm::CheckValidity(a_grid_desc_m_k, - b_grid_desc_n_k, - ds_grid_desc_m_n, - e_grid_desc_m_n, - block_2_etile_map)) + if constexpr(isMultiD) { - ds_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back( - GridwiseGemm:: - MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - ds_grid_desc_m_n)); + // block-to-e-tile-map + auto block_2_etile_map = + GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n); + block_2_etile_map_container_.push_back(block_2_etile_map); + + if(GridwiseGemm::CheckValidity(a_grid_desc_m_k, + b_grid_desc_n_k, + 
ds_grid_desc_m_n, + e_grid_desc_m_n, + block_2_etile_map)) + { + ds_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back( + + GridwiseGemm:: + MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n)); + + e_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back( + MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n)); + } + } + else + { + // there is no need to check since M, N, K are padded e_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back( - GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( e_grid_desc_m_n)); } } @@ -844,7 +1028,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 // pointers const ADataType* p_a_grid_; const BDataType* p_b_grid_; - typename GridwiseGemm::DsGridPointer p_ds_grid_; + typename GridwiseGemmMultipleD_xdl_cshuffle::DsGridPointer + p_ds_grid_; EDataType* p_e_grid_; // tensor descriptor for problem definition @@ -891,6 +1076,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 std::array input_left_pads_; std::array input_right_pads_; + const index_t k_batch_; index_t num_workgroups_per_Conv_N_; }; @@ -899,7 +1085,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 { using Argument = DeviceOp::Argument; - float RunGemm(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + float RunMultiDGemm(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { float ave_time = 0; @@ -998,6 +1184,678 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 return ave_time; } + float RunGemmV3(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + float ave_time = 0; + + const ADataType* p_a_grid = arg.p_a_grid_; + const BDataType* p_b_grid = arg.p_b_grid_; + EDataType* p_e_grid = arg.p_e_grid_; + + if constexpr(is_NGCHW_NGKHW() || + is_NGCDHW_NGKDHW()) + { + p_a_grid = type_convert(arg.p_workspace_); + 
p_e_grid = + type_convert(arg.p_workspace_) + + (arg.GetWorkspaceATensorSizeBytes() + arg.GetWorkspaceBTensorSizeBytes()) / + sizeof(EDataType); + } + + if constexpr(is_NGCHW_GKCYX_NGKHW() || + is_NGCDHW_GKCZYX_NGKDHW()) + { + p_b_grid = type_convert(arg.p_workspace_) + + arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType); + } + + constexpr index_t minimum_occupancy = + BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave ? 1 : 2; + + for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++) + { + const index_t GemmM = arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I1); + const index_t GemmN = arg.b_grid_desc_bk0_n_bk1_container_[i].GetLength(I1); + const index_t GemmK = arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I0) * + arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I2); + + const auto num_k_per_block = + arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(Number<0>{}) / arg.k_batch_; + + // gdy is for the kbatch and num_workgrups_per_Conv_N + index_t gdx, gdy, gdz; + std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize( + GemmM, GemmN, arg.k_batch_ * arg.num_workgroups_per_Conv_N_, arg.num_group_); + + index_t k_grain = arg.k_batch_ * KPerBlock; + index_t K_split = (GemmK + k_grain - 1) / k_grain * KPerBlock; + const bool has_main_k_block_loop = + GridwiseGemm::CalculateHasMainKBlockLoop(K_split); + + typename GridwiseGemm::Argument gemm_arg{ + p_a_grid, p_b_grid, p_e_grid, GemmM, GemmN, GemmK, I0, I0, I0, arg.k_batch_}; + + const auto Run = [&](const auto& kernel) { + if(stream_config.flush_cache) + { + typename GridwiseGemm::Argument gemm_arg_ = gemm_arg; + ck::utility::RotatingMemWrapper + rotating_mem(gemm_arg_, + stream_config.rotating_count, + gemm_arg_.M * gemm_arg_.K * sizeof(ADataType), + gemm_arg_.K * gemm_arg_.N * sizeof(BDataType)); + rotating_mem.Print(); + + auto run_flush_cache = [&]() { + // flush icache + ck::utility::flush_icache(); + // rotating mem + rotating_mem.Next(); + }; + + ave_time += 
ck::utility::launch_and_time_kernel_with_preprocess( + stream_config, + run_flush_cache, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + gemm_arg_, + arg.a_grid_desc_ak0_m_ak1_container_[i], + arg.b_grid_desc_bk0_n_bk1_container_[i], + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i], + arg.compute_ptr_offset_of_batch_, + arg.compute_ptr_offset_of_n_, + num_k_per_block); + } + else + { + ave_time += launch_and_time_kernel( + stream_config, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + gemm_arg, + arg.a_grid_desc_ak0_m_ak1_container_[i], + arg.b_grid_desc_bk0_n_bk1_container_[i], + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i], + arg.compute_ptr_offset_of_batch_, + arg.compute_ptr_offset_of_n_, + num_k_per_block); + } + }; + + if(has_main_k_block_loop) + { + // Tail number always full + if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 || + BlkGemmPipelineVer == BlockGemmPipelineVersion::v3) + { + if(gemm_arg.KBatch > 1) + { + if constexpr(IsSplitKSupported) + { + const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy>; + Run(kernel); + } + } + else + { + const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy>; + Run(kernel); + } + } + // Tail number could be One to Seven + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2) + { + if(gemm_arg.KBatch > 1) + { + if constexpr(IsSplitKSupported) + { + 
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::One) + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::One>; + Run(kernel); + } + else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Full) + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Full>; + Run(kernel); + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Two) + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp:: + EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Two>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Three) + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp:: + EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + 
TailNumber::Three>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Four) + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp:: + EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Four>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Five) + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp:: + EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Five>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Six) + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp:: + EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Six>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Seven) + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + 
DeviceOp:: + EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Seven>; + Run(kernel); + } + } + } + } + else + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One) + { + const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::One>; + Run(kernel); + } + else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Full) + { + const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Full>; + Run(kernel); + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Two) + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Two>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Three) + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + 
DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Three>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Four) + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Four>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Five) + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Five>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Six) + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Six>; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7) + { + 
if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Seven) + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Seven>; + Run(kernel); + } + } + } + } + // Tail number could be Odd or Even + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4) + { + if(gemm_arg.KBatch > 1) + { + if constexpr(IsSplitKSupported) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Odd) + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3_2lds< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Odd>; + Run(kernel); + } + else + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3_2lds< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Even>; + Run(kernel); + } + } + } + else + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3_2lds< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Odd>; + 
Run(kernel); + } + else + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3_2lds< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Even>; + Run(kernel); + } + } + } + else + { + if(gemm_arg.KBatch > 1) + { + if constexpr(IsSplitKSupported) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Odd) + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Odd>; + Run(kernel); + } + else + { + const auto kernel = + kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy, + TailNumber::Even>; + Run(kernel); + } + } + } + else + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + { + const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Odd>; + Run(kernel); + } + else + { + const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, 
+ DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + true, + InMemoryDataOperationEnum::Set, + minimum_occupancy, + TailNumber::Even>; + Run(kernel); + } + } + } + } + else + { + // Tail number always 1 + if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1) + { + if(gemm_arg.KBatch > 1) + { + if constexpr(IsSplitKSupported) + { + const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + false, + InMemoryDataOperationEnum::AtomicAdd, + minimum_occupancy>; + Run(kernel); + } + } + else + { + const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< + GridwiseGemm, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + ComputePtrOffsetOfStridedBatch, + false, + InMemoryDataOperationEnum::Set, + minimum_occupancy>; + Run(kernel); + } + } + } + } + return ave_time; + } + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { float ave_time = 0; @@ -1084,7 +1942,16 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 static_cast(arg.compute_ptr_offset_of_n_.BatchStrideA_)}, std::array{0}); } - ave_time += RunGemm(arg, stream_config); + + if constexpr(isMultiD) + { + ave_time += RunMultiDGemm(arg, stream_config); + } + else + { + ave_time += RunGemmV3(arg, stream_config); + } + // Transpose from NHWGC to NGCHW if constexpr(is_NGCHW_NGKHW() || is_NGCDHW_NGKDHW()) @@ -1148,10 +2015,47 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 return false; } + if(!is_bf16_atomic_supported() && std::is_same_v && + arg.k_batch_ > 1) + { + return false; + } + + if constexpr(!IsSplitKSupported) + { + if(arg.k_batch_ != 1) + { + 
return false; + } + } + const index_t ConvG = arg.b_g_k_c_xs_lengths_[0]; const index_t ConvK = arg.b_g_k_c_xs_lengths_[1]; const index_t ConvC = arg.b_g_k_c_xs_lengths_[2]; + if constexpr(!isMultiD) + { + for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++) + { + const index_t GemmM = arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I1); + const index_t GemmN = arg.b_grid_desc_bk0_n_bk1_container_[i].GetLength(I1); + const index_t GemmK = arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I0) * + arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I2); + + typename GridwiseGemm::Argument gemm_arg{ + nullptr, nullptr, nullptr, GemmM, GemmN, GemmK, I0, I0, I0, arg.k_batch_}; + + const auto num_k_loop = gemm_arg.AK0 / (KPerBlock / AK1); + if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1) + { + if(num_k_loop <= GridwiseGemm::BlockwiseGemmPipe::PrefetchStages) + { + return false; + } + } + } + } + // Specifialization if constexpr(ConvBackwardDataSpecialization == ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0) @@ -1254,13 +2158,16 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 // Gridwise GEMM size for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++) { - if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i], - arg.b_grid_desc_n_k_container_[i], - arg.ds_grid_desc_m_n_container_[i], - arg.e_grid_desc_m_n_container_[i], - arg.block_2_etile_map_container_[i])) + if constexpr(isMultiD) { - return false; + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i], + arg.b_grid_desc_n_k_container_[i], + arg.ds_grid_desc_m_n_container_[i], + arg.e_grid_desc_m_n_container_[i], + arg.block_2_etile_map_container_[i])) + { + return false; + } } } @@ -1335,7 +2242,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 const std::array& input_right_pads, const AElementwiseOp& a_element_op, const BElementwiseOp& b_element_op, - const CDEElementwiseOp& cde_element_op) + 
const CDEElementwiseOp& cde_element_op, + const ck::index_t split_k = 1) { return Argument{p_a, p_b, @@ -1355,7 +2263,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 input_right_pads, a_element_op, b_element_op, - cde_element_op}; + cde_element_op, + split_k}; } static auto MakeInvoker() { return Invoker{}; } @@ -1381,7 +2290,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 const std::array& input_right_pads, const AElementwiseOp& a_element_op, const BElementwiseOp& b_element_op, - const CDEElementwiseOp& cde_element_op) override + const CDEElementwiseOp& cde_element_op, + const ck::index_t split_k = 1) override { return std::make_unique(p_a, p_b, @@ -1401,7 +2311,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 input_right_pads, a_element_op, b_element_op, - cde_element_op); + cde_element_op, + split_k); } std::unique_ptr MakeInvokerPointer() override @@ -1413,6 +2324,17 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 { auto str = std::stringstream(); + std::map BlkGemmPipelineSchedulerToString{ + {BlockGemmPipelineScheduler::Intrawave, "Intrawave"}, + {BlockGemmPipelineScheduler::Interwave, "Interwave"}}; + + std::map BlkGemmPipelineVersionToString{ + {BlockGemmPipelineVersion::v1, "v1"}, + {BlockGemmPipelineVersion::v2, "v2"}, + {BlockGemmPipelineVersion::v3, "v3"}, + {BlockGemmPipelineVersion::v4, "v4"}, + {BlockGemmPipelineVersion::v5, "v5"}}; + // clang-format off str << "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1" << "<" @@ -1430,7 +2352,11 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 << ABlockTransferSrcScalarPerVector << ", " << BBlockTransferSrcScalarPerVector << ", " << CShuffleMXdlPerWavePerShuffle << ", " - << CShuffleNXdlPerWavePerShuffle; + << CShuffleNXdlPerWavePerShuffle << ", " + << "BlkGemmPipelineScheduler: " + << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", " + << "BlkGemmPipelineVersion: " + << BlkGemmPipelineVersionToString[BlkGemmPipelineVer]; if 
constexpr(is_NGCHW_NGKHW() || is_NGCDHW_NGKDHW()) { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp index da7c4f759b..c7d95254c5 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp @@ -19,7 +19,7 @@ #include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_bwd_weight_v3.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp" #include #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp index d56c7abcde..dd5b97096d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp @@ -17,7 +17,7 @@ #include "ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp" #include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_bwd_weight_v3.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp" #include #include 
"ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_bwd_weight_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp similarity index 99% rename from include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_bwd_weight_v3.hpp rename to include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp index 4f5fedcd83..d37b3cd38e 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_bwd_weight_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp index 0ddfd0a7c8..a191c75099 100644 --- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp @@ -187,7 +187,8 @@ struct TransformConvBwdDataToGemm_v1 WTilde_{static_cast(transform_conv_bwd_data_to_gemm_base.WTilde_)}, ZDot_{static_cast(transform_conv_bwd_data_to_gemm_base.ZDot_)}, YDot_{static_cast(transform_conv_bwd_data_to_gemm_base.YDot_)}, - XDot_{static_cast(transform_conv_bwd_data_to_gemm_base.XDot_)} + XDot_{static_cast(transform_conv_bwd_data_to_gemm_base.XDot_)}, + batch_k_{transform_conv_bwd_data_to_gemm_base.batch_k_} { } @@ -203,7 +204,8 @@ struct TransformConvBwdDataToGemm_v1 const ConvSpatialDimsType& conv_filter_dilations, const ConvSpatialDimsType& input_left_pads, const ConvSpatialDimsType& input_right_pads, - const 
ConvSpatialDimsType& tildes) + const ConvSpatialDimsType& tildes, + const index_t batch_k = 1) : Hi_{c_g_n_c_wis_lengths[HIdx]}, Wi_{c_g_n_c_wis_lengths[WIdx]}, Ho_{a_g_n_k_wos_lengths[HIdx]}, @@ -231,7 +233,8 @@ struct TransformConvBwdDataToGemm_v1 InRightPadH_{input_right_pads[HIdx - NonSpatialDimsNum]}, InRightPadW_{input_right_pads[WIdx - NonSpatialDimsNum]}, IdxYTilde_{tildes[YIdx - NonSpatialDimsNum]}, - IdxXTilde_{tildes[XIdx - NonSpatialDimsNum]} + IdxXTilde_{tildes[XIdx - NonSpatialDimsNum]}, + batch_k_{batch_k} { static_assert(is_same_v> || is_same_v>); @@ -616,20 +619,22 @@ struct TransformConvBwdDataToGemm_v1 ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: Filter1x1Stride1Pad0) { - const index_t AK0 = math::integer_divide_ceil(K_, AK1); + const index_t K0PerBlock = GemmKPerBlock / AK1; + const index_t AK0 = + math::integer_divide_ceil(K_, AK1 * K0PerBlock * batch_k_) * K0PerBlock; // A: output tensor const auto out_gemmak0_gemmmraw_gemmak1_grid_desc = transform_tensor_descriptor( out_grid_desc, make_tuple(make_pass_through_transform(N_ * Do_ * Ho_ * Wo_), - make_unmerge_transform(make_tuple(AK0, AK1))), + make_unmerge_transform(make_tuple(AK0 * batch_k_, AK1))), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0, 2>{})); const auto out_gemmak0_gemmm_gemmak1_grid_desc = ck::tensor_operation::device::PadTensorDescriptor( out_gemmak0_gemmmraw_gemmak1_grid_desc, - make_tuple(AK0, GemmMPerBlock, AK1), + make_tuple(AK0 * batch_k_, GemmMPerBlock, AK1), Sequence{}); return out_gemmak0_gemmm_gemmak1_grid_desc; @@ -719,11 +724,15 @@ struct TransformConvBwdDataToGemm_v1 make_tuple(GemmKPerBlock, GemmMPerBlock), Sequence{}); - const index_t AK0 = out_gemmk_gemmm_padded_grid_desc.GetLength(I0) / AK1; + const index_t K0PerBlock = GemmKPerBlock / AK1; + const index_t AK0 = + math::integer_divide_ceil(out_gemmk_gemmm_padded_grid_desc.GetLength(I0), + AK1 * K0PerBlock * batch_k_) * + K0PerBlock; const auto 
out_gemmak0_gemmm_gemmak1_grid_desc = transform_tensor_descriptor( out_gemmk_gemmm_padded_grid_desc, - make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_tuple(make_unmerge_transform(make_tuple(AK0 * batch_k_, AK1)), make_pass_through_transform( out_gemmk_gemmm_padded_grid_desc.GetLength(I1))), make_tuple(Sequence<0>{}, Sequence<1>{}), @@ -816,11 +825,15 @@ struct TransformConvBwdDataToGemm_v1 make_tuple(GemmKPerBlock, GemmMPerBlock), Sequence{}); - const index_t AK0 = out_gemmk_gemmm_padded_grid_desc.GetLength(I0) / AK1; + const index_t K0PerBlock = GemmKPerBlock / AK1; + const index_t AK0 = + math::integer_divide_ceil(out_gemmk_gemmm_padded_grid_desc.GetLength(I0), + AK1 * K0PerBlock * batch_k_) * + K0PerBlock; const auto out_gemmak0_gemmm_gemmak1_grid_desc = transform_tensor_descriptor( out_gemmk_gemmm_padded_grid_desc, - make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_tuple(make_unmerge_transform(make_tuple(AK0 * batch_k_, AK1)), make_pass_through_transform( out_gemmk_gemmm_padded_grid_desc.GetLength(I1))), make_tuple(Sequence<0>{}, Sequence<1>{}), @@ -850,21 +863,23 @@ struct TransformConvBwdDataToGemm_v1 ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: Filter1x1Stride1Pad0) { - const index_t BK0 = math::integer_divide_ceil(K_, BK1); + const index_t K0PerBlock = GemmKPerBlock / BK1; + const index_t BK0 = + math::integer_divide_ceil(K_, BK1 * K0PerBlock * batch_k_) * K0PerBlock; // B: weight tensor - const auto wei_gemmbk0_gemmnraw_gemmbk1_grid_desc = - transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K_, C_)), - make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), - make_pass_through_transform(C_)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + const auto wei_gemmbk0_gemmnraw_gemmbk1_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K_, C_)), + make_tuple(make_unmerge_transform(make_tuple(BK0 * 
batch_k_, BK1)), + make_pass_through_transform(C_)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); make_naive_tensor_descriptor(make_tuple(N_ * Do_ * Ho_ * Wo_, C_), make_tuple(I0, I1)); const auto wei_gemmbk0_gemmn_gemmbk1_grid_desc = ck::tensor_operation::device::PadTensorDescriptor( wei_gemmbk0_gemmnraw_gemmbk1_grid_desc, - make_tuple(BK0, GemmNPerBlock, BK1), + make_tuple(BK0 * batch_k_, GemmNPerBlock, BK1), Sequence{}); return wei_gemmbk0_gemmn_gemmbk1_grid_desc; @@ -925,11 +940,15 @@ struct TransformConvBwdDataToGemm_v1 make_tuple(GemmKPerBlock, GemmNPerBlock), Sequence{}); - const index_t BK0 = wei_gemmk_gemmn_padded_grid_desc.GetLength(I0) / BK1; + const index_t K0PerBlock = GemmKPerBlock / BK1; + const index_t BK0 = + math::integer_divide_ceil(wei_gemmk_gemmn_padded_grid_desc.GetLength(I0), + BK1 * K0PerBlock * batch_k_) * + K0PerBlock; const auto wei_gemmbk0_gemmn_gemmbk1_grid_desc = transform_tensor_descriptor( wei_gemmk_gemmn_padded_grid_desc, - make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_tuple(make_unmerge_transform(make_tuple(BK0 * batch_k_, BK1)), make_pass_through_transform( wei_gemmk_gemmn_padded_grid_desc.GetLength(I1))), make_tuple(Sequence<0>{}, Sequence<1>{}), @@ -1006,11 +1025,15 @@ struct TransformConvBwdDataToGemm_v1 make_tuple(GemmKPerBlock, GemmNPerBlock), Sequence{}); - const index_t BK0 = wei_gemmk_gemmn_padded_grid_desc.GetLength(I0) / BK1; + const index_t K0PerBlock = GemmKPerBlock / BK1; + const index_t BK0 = + math::integer_divide_ceil(wei_gemmk_gemmn_padded_grid_desc.GetLength(I0), + BK1 * K0PerBlock * batch_k_) * + K0PerBlock; const auto wei_gemmbk0_gemm_gemmbk1_grid_desc = transform_tensor_descriptor( wei_gemmk_gemmn_padded_grid_desc, - make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_tuple(make_unmerge_transform(make_tuple(BK0 * batch_k_, BK1)), make_pass_through_transform( wei_gemmk_gemmn_padded_grid_desc.GetLength(I1))), make_tuple(Sequence<0>{}, 
Sequence<1>{}), @@ -1355,6 +1378,7 @@ struct TransformConvBwdDataToGemm_v1 IndexType ZTilde_, YTilde_, XTilde_; IndexType DTilde_, HTilde_, WTilde_; IndexType ZDot_, YDot_, XDot_; + index_t batch_k_; }; } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp index ae6fabd0bd..5c0d7283f2 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp @@ -54,6 +54,28 @@ using device_grouped_conv_bwd_data_xdl_f16_generic_instances = // clang-format on >; +template +using device_grouped_conv_bwd_data_xdl_f16_16_16_instances = + std::tuple< + // clang-format off + // ##############################################| NDim| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl| CDEBlockTransfer| CDEBlockTransfer| + // ##############################################| Spatial| | | | | Type| Type| Type| DataType| Type| Type| Operation| Operation| Operation| DataSpecialization| GemmM| GemmN| PrefetchStage| Size| Block| Block| Block| | | XDL| XDL| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| 
DstScalar| ExtraN| PerWave| PerWave| _MBlock_MPerBlock| ScalarPerVector| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 16, 64, 32, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 16, 64, 32, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 16, 64, 32, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4>, + 
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4> + // clang-format on + >; + template , S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, - DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, @@ -108,6 +130,27 @@ using device_grouped_conv_bwd_data_xdl_bf16_generic_instances = std::tuple< // clang-format on >; +template +using 
device_grouped_conv_bwd_data_xdl_bf16_16_16_instances = std::tuple< + // clang-format off + // ##############################################| NDim| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl| CDEBlockTransfer| CDEBlockTransfer| + // ##############################################| Spatial| | | | | Type| Type| Type| DataType| Type| Type| Operation| Operation| Operation| DataSpecialization| GemmM| GemmN| PrefetchStage| Size| Block| Block| Block| | | XDL| XDL| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| PerWave| PerWave| _MBlock_MPerBlock| ScalarPerVector| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 16, 64, 32, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4>, + 
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 16, 64, 32, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 16, 64, 32, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 4> + // clang-format on + >; + template , S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, 
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, - DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, @@ -162,6 +205,28 @@ using device_grouped_conv_bwd_data_xdl_f32_generic_instances = // clang-format on >; +template +using device_grouped_conv_bwd_data_xdl_f32_16_16_instances = + std::tuple< + // clang-format off + // ##############################################| NDim| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl| CDEBlockTransfer| CDEBlockTransfer| + // ##############################################| Spatial| | | | | Type| Type| Type| DataType| Type| Type| Operation| Operation| Operation| DataSpecialization| GemmM| GemmN| PrefetchStage| Size| Block| Block| Block| | | XDL| XDL| 
PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| PerWave| PerWave| _MBlock_MPerBlock| ScalarPerVector| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 16, 64, 32, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 16, 64, 32, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 16, 64, 32, 8, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 
16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 16, 32, 8, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 4>, 4> + // clang-format on + >; + template , S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 4>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 32, 1, 4>, 1>, - DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 256, 128, 256, 32, 8, 8, 32, 32, 
2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 4>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 4>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 4>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, @@ -194,7 +259,7 @@ using device_grouped_conv_bwd_data_xdl_f32_instances = DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 32, 1, 4>, 4>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 
4, 1, 1, 1, S<1, 16, 1, 8>, 4>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 4>, 4>, - DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 4>, 4> // clang-format on >; @@ -218,7 +283,7 @@ using device_grouped_conv_bwd_data_xdl_input_fp16_comp_bf8f8_instances = DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 4, LoopScheduler::Default, BF8, F8>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 32, 1, 4>, 1, 
LoopScheduler::Default, BF8, F8>, - DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 4, LoopScheduler::Default, BF8, F8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 4, LoopScheduler::Default, BF8, F8>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 4, LoopScheduler::Default, BF8, F8>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 4, LoopScheduler::Default, BF8, F8>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, 
LoopScheduler::Default, BF8, F8>, diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp index 12695f4f16..e9ff75a91d 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp @@ -109,6 +109,8 @@ struct DeviceOperationInstanceFactory< is_same_v) { add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances(op_ptrs); + add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_16_16_instances( + op_ptrs); } #endif #ifdef CK_ENABLE_FP32 @@ -117,6 +119,8 @@ struct DeviceOperationInstanceFactory< is_same_v) { add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances(op_ptrs); + add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_16_16_instances( + op_ptrs); } #endif #ifdef CK_ENABLE_BF16 @@ -126,6 +130,8 @@ struct DeviceOperationInstanceFactory< { add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances( op_ptrs); + add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_16_16_instances( + op_ptrs); } #endif } @@ -167,6 +173,8 @@ struct DeviceOperationInstanceFactory< is_same_v) { add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_instances(op_ptrs); + add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_16_16_instances( + op_ptrs); add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_vec_transpose_instances( op_ptrs); } @@ -177,6 +185,8 @@ struct DeviceOperationInstanceFactory< is_same_v) { add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_instances(op_ptrs); + add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_16_16_instances( + op_ptrs); add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_vec_transpose_instances( op_ptrs); } @@ -188,6 +198,8 @@ struct 
DeviceOperationInstanceFactory< { add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_instances( op_ptrs); + add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_16_16_instances( + op_ptrs); add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_vec_transpose_instances( op_ptrs); } @@ -237,6 +249,8 @@ struct DeviceOperationInstanceFactory< { add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_instances( op_ptrs); + add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_16_16_instances( + op_ptrs); } #endif #if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8 @@ -255,6 +269,8 @@ struct DeviceOperationInstanceFactory< { add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances( op_ptrs); + add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_16_16_instances( + op_ptrs); } #endif #ifdef CK_ENABLE_BF16 @@ -264,6 +280,8 @@ struct DeviceOperationInstanceFactory< { add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances( op_ptrs); + add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_16_16_instances( + op_ptrs); } #endif } @@ -308,6 +326,8 @@ struct DeviceOperationInstanceFactory< { add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_instances( op_ptrs); + add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_16_16_instances( + op_ptrs); add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_vec_transpose_instances( op_ptrs); } @@ -319,6 +339,8 @@ struct DeviceOperationInstanceFactory< { add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_instances( op_ptrs); + add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_16_16_instances( + op_ptrs); add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_vec_transpose_instances( op_ptrs); } @@ -330,6 +352,8 @@ struct DeviceOperationInstanceFactory< { add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_instances( op_ptrs); + 
add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_16_16_instances( + op_ptrs); add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_vec_transpose_instances( op_ptrs); } diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc index 5be8f29e99..c723be0db8 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc @@ -69,6 +69,20 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_16_16_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_FP32 void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances( @@ -84,6 +98,20 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_16_16_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_BF16 void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances( @@ -99,6 +127,20 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_16_16_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_FP16 @@ -162,6 +204,20 @@ void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_instances( PassThrough, PassThrough>>>& instances); +void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_16_16_instances( + std::vector>>& instances); + void 
add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_vec_transpose_instances( std::vector>>& instances); +void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_16_16_instances( + std::vector>>& instances); + void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_vec_transpose_instances( std::vector>>& instances); +void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_16_16_instances( + std::vector>>& instances); + void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_vec_transpose_instances( std::vector>>& instances); + +void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_16_16_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_FP32 void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances( @@ -310,6 +408,20 @@ void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_16_16_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_BF16 void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances( @@ -325,6 +437,20 @@ void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_16_16_instances( + std::vector>>& instances); #endif #if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8 void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_input_f16_comp_bf8f8_instances( @@ -403,6 +529,20 @@ void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_instances( PassThrough, PassThrough>>>& instances); +void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_16_16_instances( + std::vector>>& instances); + void 
add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_vec_transpose_instances( std::vector>>& instances); +void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_16_16_instances( + std::vector>>& instances); + void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_vec_transpose_instances( std::vector>>& instances); +void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_16_16_instances( + std::vector>>& instances); + void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_vec_transpose_instances( std::vector{}); + device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<2, + GNHWK, + GKYXC, + Empty_Tuple, + GNHWC, + ConvBwdDataDefault>{}); // 2. Filter1x1Stride1Pad0 add_device_operation_instances( instances, - device_grouped_conv_bwd_data_xdl_bf16_instances<2, - GNHWK, - GKYXC, - Empty_Tuple, - GNHWC, - ConvBwdDataFilter1x1Stride1Pad0>{}); + device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<2, + GNHWK, + GKYXC, + Empty_Tuple, + GNHWC, + ConvBwdDataFilter1x1Stride1Pad0>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp index 64fbf8bbf2..1a3c80e5cf 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" @@ -8,7 +8,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[g, n, hi, wi, c] * wei[g, k, y, x, c] = in[g, n, ho, wo, k] + void add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f16_instances( std::vector{}); + device_grouped_conv_bwd_data_xdl_f16_16_16_instances<2, + GNHWK, + GKYXC, + Empty_Tuple, + GNHWC, + ConvBwdDataDefault>{}); // 2. Filter1x1Stride1Pad0 add_device_operation_instances( instances, - device_grouped_conv_bwd_data_xdl_f16_instances<2, - GNHWK, - GKYXC, - Empty_Tuple, - GNHWC, - ConvBwdDataFilter1x1Stride1Pad0>{}); + device_grouped_conv_bwd_data_xdl_f16_16_16_instances<2, + GNHWK, + GKYXC, + Empty_Tuple, + GNHWC, + ConvBwdDataFilter1x1Stride1Pad0>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp index f9351d96f2..96623a5161 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" @@ -8,7 +8,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[g, n, hi, wi, c] * wei[g, k, y, x, c] = in[g, n, ho, wo, k] + void add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f32_instances( std::vector{}); + device_grouped_conv_bwd_data_xdl_f32_16_16_instances<2, + GNHWK, + GKYXC, + Empty_Tuple, + GNHWC, + ConvBwdDataDefault>{}); // 2. Filter1x1Stride1Pad0 add_device_operation_instances( instances, - device_grouped_conv_bwd_data_xdl_f32_instances<2, - GNHWK, - GKYXC, - Empty_Tuple, - GNHWC, - ConvBwdDataFilter1x1Stride1Pad0>{}); + device_grouped_conv_bwd_data_xdl_f32_16_16_instances<2, + GNHWK, + GKYXC, + Empty_Tuple, + GNHWC, + ConvBwdDataFilter1x1Stride1Pad0>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_16_16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_16_16_instance.cpp new file mode 100644 index 0000000000..f3aded5043 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_16_16_instance.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_16_16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<2, + NGKHW, + GKCYX, + Empty_Tuple, + NGCHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp index 23aeeaf505..e8c6bc7cbe 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp @@ -9,7 +9,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[n, ho, wo, g, c] * wei[g, k, y, x, c] = in[n, hi, wi, g, k] + void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_bf16_instances( std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_f16_16_16_instances<2, + NGKHW, + GKCYX, + Empty_Tuple, + NGCHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace 
tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp index beeda26690..3f94d30a55 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp @@ -9,7 +9,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[n, ho, wo, g, c] * wei[g, k, y, x, c] = in[n, hi, wi, g, k] + void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f16_instances( std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_f32_16_16_instances<2, + NGKHW, + GKCYX, + Empty_Tuple, + NGCHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp index a1d768f4eb..b5e89c9b7c 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp @@ -9,7 +9,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for 
out[n, ho, wo, g, c] * wei[g, k, y, x, c] = in[n, hi, wi, g, k] + void add_device_grouped_conv2d_bwd_data_xdl_ngkhw_gkcyx_ngchw_f32_instances( std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<2, + NHWGK, + GKYXC, + Empty_Tuple, + NHWGC, + ConvBwdDataDefault>{}); + // 2. Filter1x1Stride1Pad0 + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<2, + NHWGK, + GKYXC, + Empty_Tuple, + NHWGC, + ConvBwdDataFilter1x1Stride1Pad0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp index 75e7f61f8a..11e0fc6073 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp @@ -8,7 +8,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[n, ho, wo, g, c] * wei[g, k, y, x, c] = in[n, hi, wi, g, k] + void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances( std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_f16_16_16_instances<2, + NHWGK, + GKYXC, + Empty_Tuple, + NHWGC, + ConvBwdDataDefault>{}); + // 2. 
Filter1x1Stride1Pad0 + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_f16_16_16_instances<2, + NHWGK, + GKYXC, + Empty_Tuple, + NHWGC, + ConvBwdDataFilter1x1Stride1Pad0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp index 231e894be0..a63dd712b6 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp @@ -8,7 +8,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[n, ho, wo, g, c] * wei[g, k, y, x, c] = in[n, hi, wi, g, k] + void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances( std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_f32_16_16_instances<2, + NHWGK, + GKYXC, + Empty_Tuple, + NHWGC, + ConvBwdDataDefault>{}); + // 2. 
Filter1x1Stride1Pad0 + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_f32_16_16_instances<2, + NHWGK, + GKYXC, + Empty_Tuple, + NHWGC, + ConvBwdDataFilter1x1Stride1Pad0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp index dbaece1123..e4b4165928 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp @@ -8,7 +8,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[n, ho, wo, g, c] * wei[g, k, y, x, c] = in[n, hi, wi, g, k] + void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances( std::vector{}); + device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<3, + GNDHWK, + GKZYXC, + Empty_Tuple, + GNDHWC, + ConvBwdDataDefault>{}); // 2. 
Filter1x1Stride1Pad0 add_device_operation_instances( instances, - device_grouped_conv_bwd_data_xdl_bf16_instances<3, - GNDHWK, - GKZYXC, - Empty_Tuple, - GNDHWC, - ConvBwdDataFilter1x1Stride1Pad0>{}); + device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<3, + GNDHWK, + GKZYXC, + Empty_Tuple, + GNDHWC, + ConvBwdDataFilter1x1Stride1Pad0>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp index 1885d49c81..03b8285631 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" @@ -8,7 +8,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = in[g, n, do, ho, + // wo, k] void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f16_instances( std::vector{}); + device_grouped_conv_bwd_data_xdl_f16_16_16_instances<3, + GNDHWK, + GKZYXC, + Empty_Tuple, + GNDHWC, + ConvBwdDataDefault>{}); // 2. 
Filter1x1Stride1Pad0 add_device_operation_instances( instances, - device_grouped_conv_bwd_data_xdl_f16_instances<3, - GNDHWK, - GKZYXC, - Empty_Tuple, - GNDHWC, - ConvBwdDataFilter1x1Stride1Pad0>{}); + device_grouped_conv_bwd_data_xdl_f16_16_16_instances<3, + GNDHWK, + GKZYXC, + Empty_Tuple, + GNDHWC, + ConvBwdDataFilter1x1Stride1Pad0>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp index 77135fcc05..59526ba9bc 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" @@ -8,7 +8,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = in[g, n, do, ho, + // wo, k] void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f32_instances( std::vector{}); + device_grouped_conv_bwd_data_xdl_f32_16_16_instances<3, + GNDHWK, + GKZYXC, + Empty_Tuple, + GNDHWC, + ConvBwdDataDefault>{}); // 2. 
Filter1x1Stride1Pad0 add_device_operation_instances( instances, - device_grouped_conv_bwd_data_xdl_f32_instances<3, - GNDHWK, - GKZYXC, - Empty_Tuple, - GNDHWC, - ConvBwdDataFilter1x1Stride1Pad0>{}); + device_grouped_conv_bwd_data_xdl_f32_16_16_instances<3, + GNDHWK, + GKZYXC, + Empty_Tuple, + GNDHWC, + ConvBwdDataFilter1x1Stride1Pad0>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16_16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16_16_instance.cpp new file mode 100644 index 0000000000..3f90c8b907 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16_16_instance.cpp @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_16_16_instances( + std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<3, + NDHWGK, + GKZYXC, + Empty_Tuple, + NDHWGC, + ConvBwdDataDefault>{}); + // 2. 
Filter1x1Stride1Pad0 + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<3, + NDHWGK, + GKZYXC, + Empty_Tuple, + NDHWGC, + ConvBwdDataFilter1x1Stride1Pad0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp index 663d41fe0b..f9989dec13 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" @@ -8,8 +8,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo, -// g, k] + void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances( std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_f16_16_16_instances<3, + NDHWGK, + GKZYXC, + Empty_Tuple, + NDHWGC, + ConvBwdDataDefault>{}); + // 2. 
Filter1x1Stride1Pad0 + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_f16_16_16_instances<3, + NDHWGK, + GKZYXC, + Empty_Tuple, + NDHWGC, + ConvBwdDataFilter1x1Stride1Pad0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp index ac0ab44ce3..071d34b94a 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" @@ -8,8 +8,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo, -// g, k] + void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_instances( std::vector>>& instances) +{ + // 1. Default + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_f32_16_16_instances<3, + NDHWGK, + GKZYXC, + Empty_Tuple, + NDHWGC, + ConvBwdDataDefault>{}); + // 2. 
Filter1x1Stride1Pad0 + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_f32_16_16_instances<3, + NDHWGK, + GKZYXC, + Empty_Tuple, + NDHWGC, + ConvBwdDataFilter1x1Stride1Pad0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp index 50d5cce73d..77127bf7f9 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp" @@ -8,8 +8,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo, -// g, k] + void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances( std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_bf16_16_16_instances<3, + NGKDHW, + GKCZYX, + Empty_Tuple, + NGCDHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp index a9a6b4d281..943c5bab26 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp @@ -9,8 +9,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo, -// g, k] + void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_bf16_instances( std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_f16_16_16_instances<3, + NGKDHW, + GKCZYX, + Empty_Tuple, + NGCDHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} 
// namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp index eec3944078..bada2507c2 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp @@ -9,8 +9,7 @@ namespace ck { namespace tensor_operation { namespace device { namespace instance { -// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo, -// g, k] + void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f16_instances( std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_bwd_data_xdl_f32_16_16_instances<3, + NGKDHW, + GKCZYX, + Empty_Tuple, + NGCDHW, + ConvBwdDataDefault>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp index a596482ca8..f1c6f53bf3 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp @@ -9,8 +9,7 @@ namespace ck { namespace tensor_operation { namespace device { 
namespace instance { -// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo, -// g, k] + void add_device_grouped_conv3d_bwd_data_xdl_ngkdhw_gkczyx_ngcdhw_f32_instances( std::vectorGetWorkSpaceSize(argument_ptr.get()); DeviceMem workspace_dev(workspace_sz); @@ -150,7 +154,8 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, float gb_per_sec = num_btype / 1.E6 / avg_time; std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " - << gb_per_sec << " GB/s, " << op_name << std::endl; + << gb_per_sec << " GB/s, " << op_name << ", SplitK " << split_k_for_run + << std::endl; if(tflops > best_tflops) { @@ -158,13 +163,39 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, best_tflops = tflops; best_avg_time = avg_time; best_gb_per_sec = gb_per_sec; + best_split_k = split_k_for_run; } if(do_verification) { in_device_buf.FromDevice(in_device.mData.data()); - pass = pass & ck::utils::check_err(in_device, in_host); + using ComputeType = std::conditional_t; + using AccDataType = + std::conditional_t, int32_t, float>; + const index_t num_accums = conv_param.K_; + // Calculate thresholds + auto rtol = ck::utils::get_relative_threshold( + num_accums / split_k_for_run); + auto atol = ck::utils::get_absolute_threshold( + max_accumulated_value / split_k_for_run, num_accums / split_k_for_run); + // Calculate error due to split_k accumulation + auto rtol_split_k = + ck::utils::get_relative_threshold( + split_k_for_run); + auto atol_split_k = + ck::utils::get_absolute_threshold( + max_accumulated_value, split_k_for_run); + // Use higher threshold + rtol = std::max(rtol, rtol_split_k); + atol = std::max(atol, atol_split_k); + + pass = pass & ck::utils::check_err( + in_device, in_host, "Error: Incorrect results!", rtol, atol); + std::cout << "Relative error threshold: " << rtol + << " Absolute error threshold: " << atol << std::endl; if(do_log) { @@ -225,35 +256,47 @@ bool 
profile_grouped_conv_bwd_data_impl(int do_verification, copy(conv_param.input_left_pads_, input_left_pads); copy(conv_param.input_right_pads_, input_right_pads); + std::vector split_k_list = {1, 2, 4, 8, 16, 32, 64, 128}; + + if(split_k > 0) + { + split_k_list = {split_k}; + } + for(auto& op_ptr : op_ptrs) { - auto argument_ptr = - op_ptr->MakeArgumentPointer(static_cast(out_device_buf.GetDeviceBuffer()), - static_cast(wei_device_buf.GetDeviceBuffer()), - {}, - static_cast(in_device_buf.GetDeviceBuffer()), - out_lengths, - out_strides, - wei_lengths, - wei_strides, - {}, - {}, - in_lengths, - in_strides, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - out_element_op, - wei_element_op, - in_element_op); + for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++) + { + auto argument_ptr = op_ptr->MakeArgumentPointer( + static_cast(out_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + {}, + static_cast(in_device_buf.GetDeviceBuffer()), + out_lengths, + out_strides, + wei_lengths, + wei_strides, + {}, + {}, + in_lengths, + in_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + out_element_op, + wei_element_op, + in_element_op, + split_k_list[split_k_id]); - run_impl(op_ptr, argument_ptr); + run_impl(op_ptr, argument_ptr, split_k_list[split_k_id]); + } } std::cout << "Best configuration parameters:" << "\nname: " << best_op_name << "\navg_time: " << best_avg_time - << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl; + << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << ", SplitK " + << best_split_k << std::endl; return pass; } diff --git a/profiler/src/profile_grouped_conv_bwd_data.cpp b/profiler/src/profile_grouped_conv_bwd_data.cpp index 1515f1105f..5cdece499e 100644 --- a/profiler/src/profile_grouped_conv_bwd_data.cpp +++ b/profiler/src/profile_grouped_conv_bwd_data.cpp @@ -68,8 +68,8 @@ 
int profile_grouped_conv_bwd_data(int argc, char* argv[]) const bool time_kernel = std::stoi(argv[7]); const int num_dim_spatial = std::stoi(argv[8]); - // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial - if(argc != 8 + 1 + 4 + 6 * num_dim_spatial) + // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial, 1 for split-K + if(argc != 8 + 1 + 4 + 6 * num_dim_spatial + 1) { print_helper_msg(); return 1; @@ -77,6 +77,8 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[]) const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv); + ck::index_t split_k = std::stoi(argv[8 + 1 + 4 + 6 * num_dim_spatial]); + using F32 = float; using F16 = ck::half_t; using BF16 = ck::bhalf_t; @@ -110,7 +112,7 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[]) OutDataType, WeiDataType, InDataType>( - do_verification, init_method, do_log, time_kernel, params); + do_verification, init_method, do_log, time_kernel, params, split_k); return pass ? 
0 : 1; }; diff --git a/script/convert_miopen_driver_to_profiler.py b/script/convert_miopen_driver_to_profiler.py index 1278b6744d..2ddcbb67cd 100644 --- a/script/convert_miopen_driver_to_profiler.py +++ b/script/convert_miopen_driver_to_profiler.py @@ -126,6 +126,8 @@ def run_ck_grouped_conv_bwd_data(args): args.ck_profier_op = "grouped_conv_bwd_data" parse_data_type(args) parse_layouts(args) + # Test all split K value from the list {1, 2, 4, 8, 32, 64, 128} + args.split_k_value = -1 cmd = [str(args.ck_profiler_cmd), str(args.ck_profier_op)] cmd += [str(args.data_type), str(args.layout)] @@ -136,6 +138,7 @@ def run_ck_grouped_conv_bwd_data(args): cmd += [str(args.in_channels)] add_conv_params_to_cmd(args, cmd) + cmd += [str(args.split_k_value)] run_ck_profiler_cmd(cmd) diff --git a/test/grouped_convnd_bwd_data/CMakeLists.txt b/test/grouped_convnd_bwd_data/CMakeLists.txt index 6d78da8db7..5c816da416 100644 --- a/test/grouped_convnd_bwd_data/CMakeLists.txt +++ b/test/grouped_convnd_bwd_data/CMakeLists.txt @@ -2,6 +2,11 @@ add_gtest_executable(test_grouped_convnd_bwd_data_xdl test_grouped_convnd_bwd_da if(result EQUAL 0) target_link_libraries(test_grouped_convnd_bwd_data_xdl PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance) endif() +if(GPU_TARGETS MATCHES "gfx9") + add_executable(test_grouped_convnd_bwd_data_xdl_large_cases test_grouped_convnd_bwd_data_xdl_large_cases.cpp) + target_compile_options(test_grouped_convnd_bwd_data_xdl_large_cases PRIVATE -Wno-global-constructors -Wno-undef) + target_link_libraries(test_grouped_convnd_bwd_data_xdl_large_cases PRIVATE gtest_main getopt::getopt utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance) +endif() add_gtest_executable(test_grouped_convnd_bwd_data_wmma test_grouped_convnd_bwd_data_wmma.cpp) if(result EQUAL 0) target_link_libraries(test_grouped_convnd_bwd_data_wmma PRIVATE utility device_grouped_conv2d_bwd_data_instance 
device_grouped_conv3d_bwd_data_instance) diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp index eb6083c521..c4404b95ba 100644 --- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp +++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp @@ -21,26 +21,31 @@ class TestGroupedConvndBwdDataXdl : public ::testing::Test using InLayout = std::tuple_element_t<3, Tuple>; std::vector conv_params; + std::vector split_ks{1, 2}; template void Run() { EXPECT_FALSE(conv_params.empty()); bool pass = true; - for(auto& param : conv_params) + for(auto split_k : split_ks) { - pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl( - true, // do_verification - 1, // init_method: integer value - false, // do_log - false, // time_kernel - param); + for(auto& param : conv_params) + { + pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl( + true, // do_verification + 1, // init_method: integer value + false, // do_log + false, // time_kernel + param, + split_k); + } } EXPECT_TRUE(pass); } @@ -92,19 +97,16 @@ TYPED_TEST(TestGroupedConvndBwdDataXdl2d, Test2D) this->conv_params.clear(); this->conv_params.push_back( - {2, 2, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + {2, 2, 2, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); this->conv_params.push_back( - {2, 2, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + {2, 2, 2, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); this->conv_params.push_back( - {2, 2, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); + {2, 2, 2, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); this->conv_params.push_back( - {2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); - this->conv_params.push_back({2, 1, 1, 1, 32, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); - this->conv_params.push_back({2, 1, 
1, 64, 3, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); - this->conv_params.push_back({2, 1, 1, 1, 1, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); - // SplitN case - this->conv_params.push_back( - {2, 1, 128, 4, 192, {2, 2}, {224, 224}, {224, 224}, {1, 1}, {0, 0}, {0, 0}}); + {2, 2, 2, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); + this->conv_params.push_back({2, 1, 1, 1, 32, {8, 8}, {16, 16}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->conv_params.push_back({2, 1, 1, 64, 3, {8, 8}, {16, 16}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->conv_params.push_back({2, 1, 1, 1, 1, {8, 8}, {16, 16}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); this->template Run<2>(); } @@ -112,28 +114,16 @@ TYPED_TEST(TestGroupedConvndBwdDataXdl3d, Test3D) { this->conv_params.clear(); this->conv_params.push_back( - {3, 2, 16, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); + {3, 2, 2, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); this->conv_params.push_back( {3, 2, 2, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); this->conv_params.push_back( - {3, 2, 32, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); + {3, 2, 2, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); this->conv_params.push_back( - {3, 1, 1, 1, 32, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + {3, 1, 1, 1, 32, {3, 3, 3}, {4, 16, 16}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); this->conv_params.push_back( - {3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + {3, 1, 1, 64, 3, {3, 3, 3}, {4, 16, 16}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); this->conv_params.push_back( - {3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); - // SplitN case - this->conv_params.push_back({3, - 1, - 128, - 4, - 192, - {2, 2, 2}, - {2, 224, 224}, - {1, 224, 224}, - {1, 1, 1}, - {0, 
0, 0}, - {0, 0, 0}}); + {3, 1, 1, 1, 1, {3, 3, 3}, {4, 16, 16}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); this->template Run<3>(); } diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl_large_cases.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl_large_cases.cpp new file mode 100644 index 0000000000..73d793cc5f --- /dev/null +++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl_large_cases.cpp @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include + +#include "profiler/profile_grouped_conv_bwd_data_impl.hpp" + +template +class TestGroupedConvndBwdDataXdl : public ::testing::Test +{ + protected: + using DataType = std::tuple_element_t<0, Tuple>; + using OutLayout = std::tuple_element_t<1, Tuple>; + using WeiLayout = std::tuple_element_t<2, Tuple>; + using InLayout = std::tuple_element_t<3, Tuple>; + + std::vector conv_params; + std::vector split_ks{1, 2}; + + template + void Run() + { + EXPECT_FALSE(conv_params.empty()); + bool pass = true; + for(auto split_k : split_ks) + { + for(auto& param : conv_params) + { + pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl( + true, // do_verification + 1, // init_method: integer value + false, // do_log + false, // time_kernel + param, + split_k); + } + } + EXPECT_TRUE(pass); + } +}; + +using namespace ck::tensor_layout::convolution; + +using KernelTypes2d = ::testing::Types, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple>; + +using KernelTypes3d = ::testing::Types, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple>; + +template +class TestGroupedConvndBwdDataXdl2d : public TestGroupedConvndBwdDataXdl +{ +}; + 
+template +class TestGroupedConvndBwdDataXdl3d : public TestGroupedConvndBwdDataXdl +{ +}; + +TYPED_TEST_SUITE(TestGroupedConvndBwdDataXdl2d, KernelTypes2d); +TYPED_TEST_SUITE(TestGroupedConvndBwdDataXdl3d, KernelTypes3d); + +TYPED_TEST(TestGroupedConvndBwdDataXdl2d, Test2D) +{ + this->conv_params.clear(); + // SplitN case + this->conv_params.push_back( + {2, 1, 128, 4, 192, {2, 2}, {224, 224}, {224, 224}, {1, 1}, {0, 0}, {0, 0}}); + this->template Run<2>(); +} + +TYPED_TEST(TestGroupedConvndBwdDataXdl3d, Test3D) +{ + this->conv_params.clear(); + // SplitN case + this->conv_params.push_back({3, + 1, + 128, + 4, + 192, + {2, 2, 2}, + {2, 224, 224}, + {1, 224, 224}, + {1, 1, 1}, + {0, 0, 0}, + {0, 0, 0}}); + this->template Run<3>(); +} From 768c99eca9e6a4e4edc4e6b920939933eafb4aea Mon Sep 17 00:00:00 2001 From: Khushbu Agarwal Date: Mon, 28 Apr 2025 18:19:23 -0700 Subject: [PATCH 078/443] [TileEngine] Support for sparsity in codegen (#2128) * Added sparsity flag in codegen * remove comments * clan formatted * added sparsity as runtime argument * updated README * updated stream config variable * fix typo for tail_num in hot loop --- tile_engine/ops/gemm/README.md | 37 ++++++++++--------- tile_engine/ops/gemm/gemm_host_api.cpp | 20 ++++++++-- tile_engine/ops/gemm/gemm_host_api.hpp | 1 + tile_engine/ops/gemm/gemm_instance_builder.py | 33 ++++++++++------- 4 files changed, 56 insertions(+), 35 deletions(-) mode change 100644 => 100755 tile_engine/ops/gemm/gemm_host_api.cpp mode change 100644 => 100755 tile_engine/ops/gemm/gemm_host_api.hpp diff --git a/tile_engine/ops/gemm/README.md b/tile_engine/ops/gemm/README.md index 495232f19b..08456a1675 100644 --- a/tile_engine/ops/gemm/README.md +++ b/tile_engine/ops/gemm/README.md @@ -20,24 +20,25 @@ make tile_engine_gemm -j ## tile_engine_gemm inputs ``` - -m m dimension (default:3840) - -n n dimension (default:4096) - -k k dimension (default:2048) - -stride_a Tensor A stride (default:0) - -stride_b Tensor B stride (default:0) 
- -stride_c Tensor C stride (default:0) - -split_k SplitK value (default:1) - -v No validation: 0, Validation on CPU: 1, Validation on GPU: 2 (default:2) - -warmup Number of iterations before benchmark the kernel (default:50) - -repeat Number of iterations to benchmark the kernel (default:100) - -timer gpu:gpu timer, cpu:cpu timer (default:gpu) - -init Value for initializing tensor- random: 0, linear: 1, constant(1): 2 (default:0) - -pipeline possible values are: compv3, compv4, mem (default:compv3) - -scheduler possible values are: intrawave, interwave (default:intrawave) - -epilogue possible values are: cshuffle, default (default:cshuffle) - -pad_m Pad in m direction - true/false (default:false) - -pad_n Pad in n direction - true/false (default:false) - -pad_k Pad in k direction - true/false (default:false) + -m m dimension (default:3840) + -n n dimension (default:4096) + -k k dimension (default:2048) + -stride_a Tensor A stride (default:0) + -stride_b Tensor B stride (default:0) + -stride_c Tensor C stride (default:0) + -split_k SplitK value (default:1) + -v No validation: 0, Validation on CPU: 1, Validation on GPU: 2 (default:2) + -warmup Number of iterations before benchmark the kernel (default:50) + -repeat Number of iterations to benchmark the kernel (default:100) + -timer gpu:gpu timer, cpu:cpu timer (default:gpu) + -init Value for initializing tensor- random: 0, linear: 1, constant(1): 2 (default:0) +-structured_sparsity Sparsity for tensor - 0:false, 1:true (default: 0) + -pipeline possible values are: compv3, compv4, mem (default:compv3) + -scheduler possible values are: intrawave, interwave (default:intrawave) + -epilogue possible values are: cshuffle, default (default:cshuffle) + -pad_m Pad in m direction - true/false (default:false) + -pad_n Pad in n direction - true/false (default:false) + -pad_k Pad in k direction - true/false (default:false) Note: pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be one of the options specified in 
instance_combination.json ``` diff --git a/tile_engine/ops/gemm/gemm_host_api.cpp b/tile_engine/ops/gemm/gemm_host_api.cpp old mode 100644 new mode 100755 index 3cef425a51..a5447cd658 --- a/tile_engine/ops/gemm/gemm_host_api.cpp +++ b/tile_engine/ops/gemm/gemm_host_api.cpp @@ -10,12 +10,19 @@ void gemm_kernel_launch(ck_tile::DeviceMem& c_m_n_dev_buf, ck_tile::HostTensor& c_m_n_host_result, ck_tile::HostTensor& c_m_n_dev_result, int verify, + bool structured_sparsity, KernelTraits& trait, ck_tile::GemmHostArgs& args, - const ck_tile::stream_config& s) + const ck_tile::stream_config& stream) { - return GemmDispatcher::dispatch( - c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, trait, args, s); + return GemmDispatcher::dispatch(c_m_n_dev_buf, + c_m_n_host_result, + c_m_n_dev_result, + verify, + structured_sparsity, + trait, + args, + stream); } template {}(a_m_k); + } + ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); @@ -153,6 +166,7 @@ void run(const ck_tile::ArgParser& arg_parser) c_m_n_host_result, c_m_n_dev_result, verify, + structured_sparsity, trait, gemm_args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp old mode 100644 new mode 100755 index c1e1e1dc4f..579d2770db --- a/tile_engine/ops/gemm/gemm_host_api.hpp +++ b/tile_engine/ops/gemm/gemm_host_api.hpp @@ -118,6 +118,7 @@ inline auto create_args(int argc, char* argv[]) .insert("repeat", "100", "number of iterations to benchmark the kernel") .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer") .insert("init", "0", "0:random, 1:linear, 2:constant(1)") + .insert("structured_sparsity", "0", "0:false, 1:true") .insert("pipeline", "compv3", "compv3, compv4, mem") .insert("scheduler", "intrawave", "intrawave, 
interwave") .insert("epilogue", "cshuffle", "cshuffle, default") diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index cfefd38cd2..b6c7685fb2 100755 --- a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -69,7 +69,7 @@ HOT_LOOP_FALSE = """ else if(tail_num == ck_tile::TailNumber::Even) { Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + ck_tile::integral_constant{}); } else { @@ -347,7 +347,8 @@ namespace {group_name} {{ return f""" template + int WarpTileM, int WarpTileN, int WarpTileK, + bool structured_sparsity> struct GemmKernel {{ static constexpr bool kPadM = {BOOL_MAP(kPadM)}; static constexpr bool kPadN = {BOOL_MAP(kPadN)}; @@ -356,7 +357,7 @@ struct GemmKernel {{ static float launch(ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) {{ static constexpr bool permuteA = false; static constexpr bool permuteB = false; - static constexpr bool DoubleSmemBuffer = false; + static constexpr bool DoubleSmemBuffer ={"true" if pipeline == "compv4" else "false"}; static constexpr bool TransposeC = false; static constexpr int kBlockPerCu = 1; @@ -381,7 +382,7 @@ struct GemmKernel {{ using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits; + ALayout, BLayout, CLayout, TransposeC, structured_sparsity>; using GemmPipelineProblem = ck_tile::GemmPipelineProblem; @@ -494,7 +495,7 @@ struct GemmDispatcher { return kernel_map; } - static void init() { + static void init(bool structured_sparsity) { auto& kernel_map = get_kernel_map(); if(!kernel_map.empty()) return; \n""" @@ -513,11 +514,11 @@ struct GemmDispatcher { for group in self.all_kernels: - content += f""" kernel_map["{group}"] = [](ck_tile::DeviceMem& c_m_n_dev_buf, + content += f""" kernel_map["{group}"] = [=](ck_tile::DeviceMem& c_m_n_dev_buf, ck_tile::HostTensor& c_m_n_host_result, ck_tile::HostTensor& c_m_n_dev_result, int verify, ck_tile::GemmHostArgs& args, - const 
ck_tile::stream_config& s) {{ + const ck_tile::stream_config& stream) {{ """ for tile in tile_params: # Check if we have valid tile/warp combinations @@ -526,7 +527,11 @@ struct GemmDispatcher { ((tile[1]/(tile[4] * tile[8]) * tile[4] * tile[8]) != tile[1]): continue content += f""" - run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, s);""" + if(structured_sparsity) {{ + run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {1}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream); + }} else {{ + run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {0}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream); + }}""" content += f""" }};\n""" @@ -536,9 +541,9 @@ struct GemmDispatcher { static void run_kernel(ck_tile::DeviceMem& c_m_n_dev_buf, ck_tile::HostTensor& c_m_n_host_result, ck_tile::HostTensor& c_m_n_dev_result, - int verify, ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) + int verify, ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) { - float avg_time = Kernel::launch(args, s); + float avg_time = Kernel::launch(args, stream); std::string description = Kernel::get_name(); c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); @@ -559,13 +564,13 @@ struct GemmDispatcher { static auto dispatch(ck_tile::DeviceMem& c_m_n_dev_buf, ck_tile::HostTensor& c_m_n_host_result, ck_tile::HostTensor& c_m_n_dev_result, - int verify, const KernelTraits &trait, ck_tile::GemmHostArgs& gemm_args, - const ck_tile::stream_config& s) { - init(); + int verify, bool structured_sparsity, const KernelTraits &trait, ck_tile::GemmHostArgs& gemm_args, + const ck_tile::stream_config& stream) { + init(structured_sparsity); 
const std::string key = assemble_key(trait); auto& kernel_map = get_kernel_map(); if(auto it = kernel_map.find(key); it != kernel_map.end()) { - return it->second(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify,gemm_args, s); + return it->second(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, gemm_args, stream); } throw std::runtime_error("No suitable kernel found: " + key); } From d107f3c3a53b6582a073e906133a9b05502352e8 Mon Sep 17 00:00:00 2001 From: Khushbu Agarwal Date: Mon, 28 Apr 2025 18:19:50 -0700 Subject: [PATCH 079/443] Support for MFMA_16x16x128 for fp8/bf8 (#2125) * Adding 16x16x128 support for gfx950 * Support for fp8 and bf8 * fix input arguments for MFMA scale instruction * clang-formatted * Fixes for lwpck-3145 (#2138) * Fix lds tile & cmake dep & default epilogue * Fallback BTypeToUse to ADataType in WOQ cases * reverting instance json file * reverting instance json file --------- Co-authored-by: Yi DING --- .../ops/epilogue/cshuffle_epilogue.hpp | 3 +- .../ops/epilogue/default_2d_epilogue.hpp | 21 ++-- .../block/block_universal_gemm_as_bs_cr.hpp | 2 +- include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 12 +++ .../warp/warp_gemm_attribute_mfma_impl.hpp | 98 +++++++++++++++++++ .../ops/gemm/warp/warp_gemm_dispatcher.hpp | 5 + tile_engine/ops/gemm/CMakeLists.txt | 8 +- tile_engine/ops/gemm/gemm_instance_builder.py | 4 +- 8 files changed, 143 insertions(+), 10 deletions(-) diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp index 0081edcb2e..225997439e 100644 --- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp @@ -49,8 +49,9 @@ struct CShuffleEpilogue using BDataType = remove_cvref_t; using AccDataType = remove_cvref_t; using ODataType = remove_cvref_t; + // Used for weight-only quantization kernel, B would be dequantized to the same data type as A using BTypeToUse = - std::conditional_t, ODataType, 
BDataType>; + std::conditional_t, ADataType, BDataType>; using CLayout = remove_cvref_t; static constexpr index_t kBlockSize = Problem::kBlockSize; static constexpr index_t kMPerBlock = Problem::kMPerBlock; diff --git a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp index 6e290fe6d7..1d6a99eb4b 100644 --- a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp @@ -25,7 +25,9 @@ struct Default2DEpilogueProblem static constexpr bool UseRawStore = UseRawStore_; }; -template { + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; using CLayout = remove_cvref_t; static constexpr index_t kMPerXdl = kMPerXdl_; static constexpr index_t kNPerXdl = kNPerXdl_; @@ -96,17 +100,22 @@ struct Default2DEpilogue template struct DefaultGemm2DEpilogue : public Default2DEpilogue { - using Problem = remove_cvref_t; - using AccDataType = remove_cvref_t; - using ODataType = remove_cvref_t; + using Problem = remove_cvref_t; + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using AccDataType = remove_cvref_t; + using ODataType = remove_cvref_t; + // Used for weight-only quantization kernel, B would be dequantized to the same data type as A + using BTypeToUse = + std::conditional_t, ADataType, BDataType>; using CLayout = remove_cvref_t; static constexpr index_t kMPerXdl = Problem::kMPerXdl; static constexpr index_t kNPerXdl = Problem::kNPerXdl; static constexpr index_t kKPerXdl = Problem::kKPerXdl; static constexpr index_t isCTransposed = Problem::isCTransposed; - using WG = WarpGemmMfmaDispatcher(BLdsTileDistr)); ALdsTile a_warp_tile_; - ALdsTile b_warp_tile_; + BLdsTile b_warp_tile_; template CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window, diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp index 4732027e57..22962b9404 100644 --- 
a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp @@ -216,6 +216,18 @@ using WarpGemmMfma_f32_16x16x64_bf8_bf8 = WarpGemmImpl>>; +using WarpGemmMfma_f32_16x16x128_fp8_fp8 = WarpGemmImpl>>; + +using WarpGemmMfma_f32_16x16x128_fp8_bf8 = WarpGemmImpl>>; + +using WarpGemmMfma_f32_16x16x128_bf8_fp8 = WarpGemmImpl>>; + +using WarpGemmMfma_f32_16x16x128_bf8_bf8 = WarpGemmImpl>>; + using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed = WarpGemmImpl>>; diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp index 08f813a1e3..cd32f35180 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp @@ -1342,6 +1342,104 @@ template using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8 = WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base; +template +struct WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base +{ + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = AType_; + using BDataType = BType_; + using CDataType = float; + + using AVecType = ext_vector_t; + using BVecType = ext_vector_t; + using CVecType = ext_vector_t; + + static constexpr index_t kM = 16; + static constexpr index_t kN = 16; + static constexpr index_t kK = 128; + + static constexpr index_t kAMBlock = 1; + static constexpr index_t kBNBlock = 1; + + static constexpr index_t kAMLane = 16; + static constexpr index_t kBNLane = 16; + static constexpr index_t kABKLane = 4; + static constexpr index_t kABKPerLane = 32; + + static constexpr index_t kCMLane = 4; + static constexpr index_t kCNLane = 16; + static constexpr index_t kCM0PerLane = 1; + static constexpr index_t kCM1PerLane = 4; + + // c_vec += a_vec * b_vec + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const + { + 
//__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, cbsz, blgp, opsel, scale_a, + // opsel, scale_b) +#if defined(__gfx950__) + if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + a_vec, b_vec, c_vec, 0, 0, 0, 0, 0, 0); + else if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + a_vec, b_vec, c_vec, 0, 1, 0, 0, 0, 0); + else if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + a_vec, b_vec, c_vec, 1, 0, 0, 0, 0, 0); + else if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + a_vec, b_vec, c_vec, 1, 1, 0, 0, 0, 0); +#else + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; +#endif + } + + // c_vec = a_vec * b_vec + CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const + { +#if defined(__gfx950__) + if constexpr(std::is_same_v && std::is_same_v) + return bit_cast(__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + a_vec, b_vec, CVecType{0.f}, 0, 0, 0, 0, 0, 0)); + else if constexpr(std::is_same_v && std::is_same_v) + return bit_cast(__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + a_vec, b_vec, CVecType{0.f}, 0, 1, 0, 0, 0, 0)); + else if constexpr(std::is_same_v && std::is_same_v) + return bit_cast(__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + a_vec, b_vec, CVecType{0.f}, 1, 0, 0, 0, 0, 0)); + else if constexpr(std::is_same_v && std::is_same_v) + return bit_cast(__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + a_vec, b_vec, CVecType{0.f}, 1, 1, 0, 0, 0, 0)); +#else + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; + return CVecType{0.f}; +#endif + } +}; + +template +using WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_fp8 = + WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base; + +template +using WarpGemmAttributeMfmaImpl_f32_16x16x128_fp8_bf8 = + 
WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base; + +template +using WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_fp8 = + WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base; + +template +using WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8 = + WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base; + // int8 template struct WarpGemmAttributeMfmaImpl_i32_32x32x16_i8 diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp index f437ee10c5..0e3342c479 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp @@ -69,6 +69,11 @@ template<> struct WarpGemmMfmaDispatcher struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_16x16x64_bf8_bf8; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_16x16x128_fp8_fp8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_16x16x128_fp8_bf8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_16x16x128_bf8_fp8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_16x16x128_bf8_bf8; }; + // clang-format on } // namespace impl diff --git a/tile_engine/ops/gemm/CMakeLists.txt b/tile_engine/ops/gemm/CMakeLists.txt index d28017ca0c..bc613a931e 100644 --- a/tile_engine/ops/gemm/CMakeLists.txt +++ b/tile_engine/ops/gemm/CMakeLists.txt @@ -8,6 +8,10 @@ execute_process( --list_blobs RESULT_VARIABLE ret ) +set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS + ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py + ${CMAKE_CURRENT_LIST_DIR}/configs/instance_combination.json +) if(ret AND NOT ret EQUAL 0) message( FATAL_ERROR "Fail to generate kernels via Python. 
${ret}") @@ -21,7 +25,9 @@ add_custom_command( --working_path ${CMAKE_CURRENT_BINARY_DIR} --json ${CMAKE_CURRENT_LIST_DIR}/configs/instance_combination.json --gen_blobs - DEPENDS ${GEMM_CODEGEN_BLOBS} + DEPENDS ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py + ${CMAKE_CURRENT_BINARY_DIR}/gemm_instance_blobs.txt + ${CMAKE_CURRENT_LIST_DIR}/configs/instance_combination.json ) set(EXECUTABLE_GEMM_INSTANCE "tile_engine_gemm") diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index b6c7685fb2..b441bdd2d6 100755 --- a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -27,7 +27,9 @@ LAYOUT_MAP = {'r' : 'ck_tile::tensor_layout::gemm::RowMajor', DEFAULT_EPILOGUE = """ using GemmEpilogue = ck_tile::DefaultGemm2DEpilogue< - ck_tile::DefaultGemm2DEpilogueProblem Date: Mon, 28 Apr 2025 20:20:47 -0500 Subject: [PATCH 080/443] Add Matrix A and Matrix B Swizzle for LDS in Computev4 policy (#2136) * fixed computev4 policy bug for lds swizzle * added swizzle for input matrix B * Improved ComputeV4 policy and pipeline by swizzling A and B * consolidated LDS descriptor functions in parent struct --- .../gemm_pipeline_ag_bg_cr_comp_v4.hpp | 48 +- ...peline_ag_bg_cr_comp_v4_default_policy.hpp | 50 -- ...emm_universal_pipeline_ag_bg_cr_policy.hpp | 482 +++++++++--------- 3 files changed, 265 insertions(+), 315 deletions(-) diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp index 0e0ee9dbd8..667bb80ce9 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp @@ -217,17 +217,17 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4 ////////////// global window & register ///////////////// // A DRAM tile window for load auto a_copy_dram_window = - 
make_tile_window_linear(a_dram_block_window_tmp.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - a_dram_block_window_tmp.get_window_origin(), - Policy::template MakeADramTileDistribution()); + make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + a_dram_block_window_tmp.get_window_origin(), + Policy::template MakeADramTileDistribution()); // B DRAM tile window for load auto b_copy_dram_window = - make_tile_window_linear(b_dram_block_window_tmp.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - b_dram_block_window_tmp.get_window_origin(), - Policy::template MakeBDramTileDistribution()); + make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + b_dram_block_window_tmp.get_window_origin(), + Policy::template MakeBDramTileDistribution()); // A register tile for global load constexpr auto ABlockTileDistr = a_copy_dram_window.get_tile_distribution(); @@ -317,25 +317,25 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4 BLdsTile b_block_tile1; auto a_lds_ld_window0 = - make_tile_window_linear(a_lds_block0, - make_tuple(number{}, number{}), - {0, 0}, - ALdsTileDistr); + make_tile_window(a_lds_block0, + make_tuple(number{}, number{}), + {0, 0}, + ALdsTileDistr); auto a_lds_ld_window1 = - make_tile_window_linear(a_lds_block1, - make_tuple(number{}, number{}), - {0, 0}, - ALdsTileDistr); + make_tile_window(a_lds_block1, + make_tuple(number{}, number{}), + {0, 0}, + ALdsTileDistr); auto b_lds_ld_window0 = - make_tile_window_linear(b_lds_block0, - make_tuple(number{}, number{}), - {0, 0}, - BLdsTileDistr); + make_tile_window(b_lds_block0, + make_tuple(number{}, number{}), + {0, 0}, + BLdsTileDistr); auto b_lds_ld_window1 = - make_tile_window_linear(b_lds_block1, - make_tuple(number{}, number{}), - {0, 0}, - BLdsTileDistr); + make_tile_window(b_lds_block1, + make_tuple(number{}, number{}), + {0, 0}, + BLdsTileDistr); 
Base::LocalPrefetch(a_block_tile0, a_lds_ld_window0); Base::LocalPrefetch(b_block_tile0, b_lds_ld_window0); diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp index e528847438..f6920f1c57 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp @@ -17,56 +17,6 @@ namespace ck_tile { struct GemmPipelineAgBgCrCompV4DefaultPolicy : public UniversalGemmBasePolicy { - template - CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor() - { - using namespace ck_tile; - - constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM; - constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK; - constexpr index_t KPack = GetSmemPackA(); - - constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor( - make_tuple(number{}, number{}, number{}), - make_tuple(number{}, number{}, number<1>{}), - number{}, - number<1>{}); - - constexpr auto a_lds_block_desc = transform_tensor_descriptor( - a_lds_block_desc_0, - make_tuple( - make_pass_through_transform(number{}), - make_merge_transform(make_tuple(number{} / KPack, number{}))), - make_tuple(sequence<1>{}, sequence<0, 2>{}), - make_tuple(sequence<0>{}, sequence<1>{})); - - return a_lds_block_desc; - } - - template - CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor() - { - constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN; - constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK; - constexpr index_t KPack = GetSmemPackB(); - - constexpr auto b_lds_block_desc_0 = make_naive_tensor_descriptor( - make_tuple(number{}, number{}, number{}), - make_tuple(number<(kNPerBlock)*KPack>{}, number{}, number<1>{}), - number{}, - number<1>{}); - - constexpr auto b_lds_block_desc = transform_tensor_descriptor( - b_lds_block_desc_0, - make_tuple( - 
make_pass_through_transform(number{}), - make_merge_transform(make_tuple(number{}, number{}))), - make_tuple(sequence<1>{}, sequence<0, 2>{}), - make_tuple(sequence<0>{}, sequence<1>{})); - - return b_lds_block_desc; - } - template CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm() { diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp index b555cf75e0..6890cf2f64 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp @@ -19,6 +19,245 @@ struct UniversalGemmBasePolicy static constexpr auto ATileAccessPattern = tile_distribution_pattern::thread_raked; static constexpr auto BTileAccessPattern = tile_distribution_pattern::thread_raked; + template + CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor() + { + using ADataType = remove_cvref_t; + + constexpr index_t MPerBlock = Problem::BlockGemmShape::kM; + constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; + constexpr index_t KPack = GetSmemPackA(); + + constexpr auto DataTypeSize = sizeof(ADataType); + constexpr auto MLdsLayer = + (32 * 4 / KPerBlock / DataTypeSize) < 1 ? 
1 : (32 * 4 / KPerBlock / DataTypeSize); + + constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, + number{}, + number{}), + make_tuple(number{}, number{}, number<1>{}), + number{}, + number<1>{}); + + constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor( + a_lds_block_desc_0, + make_tuple(make_xor_transform(make_tuple(number{}, + number{})), + make_pass_through_transform(number{})), + make_tuple(sequence<1, 0>{}, sequence<2>{}), + make_tuple(sequence<1, 0>{}, sequence<2>{})); + + constexpr auto a_lds_block_desc_xk0_mnldslayer_mn_xk1 = transform_tensor_descriptor( + a_lds_block_desc_permuted, + make_tuple(make_unmerge_transform( + make_tuple(number{}, number{})), + make_pass_through_transform(number{}), + make_pass_through_transform(number{})), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}), + make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{})); + + constexpr auto a_lds_block_desc = transform_tensor_descriptor( + a_lds_block_desc_xk0_mnldslayer_mn_xk1, + make_tuple(make_merge_transform_v3_division_mod( + make_tuple(number{}, number{})), + make_merge_transform_v3_division_mod( + make_tuple(number{}, number{}))), + make_tuple(sequence<1, 0>{}, sequence<2, 3>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + return a_lds_block_desc; + } + + /** + * @brief Create LDS block descriptor for B tensor. + * + * @tparam Problem Gemm pipeline problem. + * @return B tensor LDS block descriptor. 
+ */ + template + CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor() + { + // using BLayout = remove_cvref_t; + using BDataType = remove_cvref_t; + + constexpr index_t NPerBlock = Problem::BlockGemmShape::kN; + constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; + +#if 1 + // if constexpr(std::is_same_v) + { + constexpr index_t KPack = GetSmemPackB(); + constexpr auto BK0 = number{}; + constexpr auto DataTypeSize = sizeof(BDataType); + constexpr auto NLdsLayer = + (32 * 4 / KPerBlock / DataTypeSize) < 1 ? 1 : (32 * 4 / KPerBlock / DataTypeSize); + + constexpr auto b_lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple( + BK0 * number{}, number{}, number{}), + make_tuple(number{}, number{}, number<1>{}), + number{}, + number<1>{}); + + constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor( + b_lds_block_desc_0, + make_tuple(make_xor_transform(make_tuple(number{}, + BK0 * number{})), + make_pass_through_transform(number{})), + make_tuple(sequence<1, 0>{}, sequence<2>{}), + make_tuple(sequence<1, 0>{}, sequence<2>{})); + + constexpr auto b_lds_block_desc_bk0_nldslayer_n_bk1 = transform_tensor_descriptor( + b_lds_block_desc_permuted, + make_tuple(make_unmerge_transform(make_tuple(number{}, BK0)), + make_pass_through_transform(number{}), + make_pass_through_transform(number{})), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}), + make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{})); + + constexpr auto b_lds_block_desc = transform_tensor_descriptor( + b_lds_block_desc_bk0_nldslayer_n_bk1, + make_tuple(make_merge_transform_v3_division_mod( + make_tuple(number{}, number{})), + make_merge_transform_v3_division_mod(make_tuple(BK0, number{}))), + make_tuple(sequence<1, 0>{}, sequence<2, 3>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + return b_lds_block_desc; + } +#else + else // B is Row Major + { + constexpr index_t BlockSize = Problem::kBlockSize; + constexpr index_t VecLoadSize = GetVectorSizeB(); + 
using TileEncodingPattern = TileDistributionEncodingPattern2D; + + constexpr auto BK0 = number{}; + constexpr auto BK1 = number{}; + // constexpr auto N0 = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I1); + constexpr auto N0 = TileEncodingPattern::X0; + constexpr auto N1 = NPerBlock / N0; + + using WarpTile = typename Problem::BlockGemmShape::WarpTile; + constexpr auto NPerXdl = number{}; + + // constexpr auto KThreadWrite = + // BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0); + constexpr auto KThreadWrite = TileEncodingPattern::Y2; + constexpr auto K0PerThreadWrite = BK0 / KThreadWrite; + constexpr auto KThreadRead = 64 / NPerXdl; + constexpr auto K0PerThreadRead = BK0 / KThreadRead; + + constexpr auto kfold = + (BK1 * N0 * sizeof(BDataType) > 128) ? 1 : 128 / (BK1 * N0 * sizeof(BDataType)); + constexpr auto KThreadReadPerm = + (kfold * K0PerThreadWrite / K0PerThreadRead) > 1 + ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead) + : KThreadRead; + + // 1<=npair<=n0 + constexpr auto npair = (BK1 * NPerXdl * sizeof(BDataType) > 128) + ? 1 + : ((128 / (BK1 * NPerXdl * sizeof(BDataType))) > N0 + ? 
N0 + : 128 / (BK1 * NPerXdl * sizeof(BDataType))); + + constexpr auto b_lds_block_desc = make_naive_tensor_descriptor_packed( + make_tuple(number{}, + number{}, + number{}, + number{}, + number{}, + BK1)); + + constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor( + b_lds_block_desc, + make_tuple( + make_pass_through_transform(number{}), + make_pass_through_transform(number{}), + make_xor_transform( + make_tuple(number{}, number{})), + make_pass_through_transform(number{}), + make_pass_through_transform(BK1)), + make_tuple( + sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}), + make_tuple( + sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{})); + + constexpr auto b_lds_block_desc_unmerged = transform_tensor_descriptor( + b_lds_block_desc_permuted, + make_tuple( + make_pass_through_transform(number{}), + make_pass_through_transform(number{}), + make_unmerge_transform(make_tuple(number{}, number{})), + make_unmerge_transform(make_tuple(number{}, number{})), + make_pass_through_transform(number{}), + make_pass_through_transform(BK1)), + make_tuple(sequence<0>{}, + sequence<1>{}, + sequence<2>{}, + sequence<3>{}, + sequence<4>{}, + sequence<5>{}), + make_tuple(sequence<1>{}, + sequence<2>{}, + sequence<0, 3>{}, + sequence<4, 5>{}, + sequence<6>{}, + sequence<7>{})); + + // constexpr auto b_lds_block_desc_bk0_n_bk1 = transform_tensor_descriptor( + // b_lds_block_desc_unmerged, + // make_tuple(make_merge_transform_v3_division_mod( + // make_tuple(number{}, + // number{}, + // number{}, + // number{})), + // make_merge_transform_v3_division_mod( + // make_tuple(number{}, number{}, number{})), + // make_pass_through_transform(BK1)), + // make_tuple(sequence<0, 1, 4, 2>{}, sequence<5, 6, 3>{}, sequence<7>{}), + // make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{})); + + constexpr auto b_lds_block_desc_kn = transform_tensor_descriptor( + b_lds_block_desc_unmerged, + 
make_tuple(make_merge_transform_v3_division_mod( + make_tuple(number{}, + number{}, + number{}, + number{}, + BK1)), + make_merge_transform_v3_division_mod( + make_tuple(number{}, number{}, number{}))), + make_tuple(sequence<0, 1, 4, 2, 7>{}, sequence<5, 6, 3>{}), + make_tuple(sequence<1>{}, sequence<0>{})); + + // return b_lds_block_desc_bk0_n_bk1; + return b_lds_block_desc_kn; + + // constexpr auto b_lds_block_desc_bk0_n_bk1 = make_naive_tensor_descriptor( + // make_tuple(BK0, number{}, number{}), + // make_tuple(number{}, number{}, number<1>{}), + // number{}, + // number<1>{}); + + // constexpr auto b_lds_block_desc = transform_tensor_descriptor( + // b_lds_block_desc_bk0_n_bk1, + // make_tuple(make_pass_through_transform(number{}), + // make_merge_transform_v3_division_mod(make_tuple(BK0, + // number{}))), + // make_tuple(sequence<1>{}, sequence<0, 2>{}), + // make_tuple(sequence<0>{}, sequence<1>{})); + + // return b_lds_block_desc; + } +#endif + } + /** * @brief Get the maximum global memory vector load size. 
* @@ -301,7 +540,7 @@ struct UniversalGemmBasePolicy template CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeA() { - constexpr auto a_lds_desc = Derived::template MakeALdsBlockDescriptor(); + constexpr auto a_lds_desc = MakeALdsBlockDescriptor(); constexpr index_t smem_size_a = integer_least_multiple( sizeof(typename Problem::ADataType) * a_lds_desc.get_element_space_size(), 16); return smem_size_a; @@ -310,7 +549,7 @@ struct UniversalGemmBasePolicy template CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeB() { - constexpr auto b_lds_desc = Derived::template MakeBLdsBlockDescriptor(); + constexpr auto b_lds_desc = MakeBLdsBlockDescriptor(); constexpr index_t smem_size_b = integer_least_multiple( sizeof(typename Problem::BDataType) * b_lds_desc.get_element_space_size(), 16); return smem_size_b; @@ -330,245 +569,6 @@ struct UniversalGemmBasePolicy struct UniversalGemmPipelineAgBgCrPolicy : public UniversalGemmBasePolicy { - template - CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor() - { - using ADataType = remove_cvref_t; - - constexpr index_t MPerBlock = Problem::BlockGemmShape::kM; - constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; - constexpr index_t KPack = GetSmemPackA(); - - constexpr auto DataTypeSize = sizeof(ADataType); - constexpr auto MLdsLayer = - (32 * 4 / KPerBlock / DataTypeSize) < 1 ? 
1 : (32 * 4 / KPerBlock / DataTypeSize); - - constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor( - make_tuple(number{}, - number{}, - number{}), - make_tuple(number{}, number{}, number<1>{}), - number{}, - number<1>{}); - - constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor( - a_lds_block_desc_0, - make_tuple(make_xor_transform(make_tuple(number{}, - number{})), - make_pass_through_transform(number{})), - make_tuple(sequence<1, 0>{}, sequence<2>{}), - make_tuple(sequence<1, 0>{}, sequence<2>{})); - - constexpr auto a_lds_block_desc_xk0_mnldslayer_mn_xk1 = transform_tensor_descriptor( - a_lds_block_desc_permuted, - make_tuple(make_unmerge_transform( - make_tuple(number{}, number{})), - make_pass_through_transform(number{}), - make_pass_through_transform(number{})), - make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}), - make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{})); - - constexpr auto a_lds_block_desc = transform_tensor_descriptor( - a_lds_block_desc_xk0_mnldslayer_mn_xk1, - make_tuple(make_merge_transform_v3_division_mod( - make_tuple(number{}, number{})), - make_merge_transform_v3_division_mod( - make_tuple(number{}, number{}))), - make_tuple(sequence<1, 0>{}, sequence<2, 3>{}), - make_tuple(sequence<0>{}, sequence<1>{})); - - return a_lds_block_desc; - } - - /** - * @brief Create LDS block descriptor for B tensor. - * - * @tparam Problem Gemm pipeline problem. - * @return B tensor LDS block descriptor. 
- */ - template - CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor() - { - // using BLayout = remove_cvref_t; - using BDataType = remove_cvref_t; - - constexpr index_t NPerBlock = Problem::BlockGemmShape::kN; - constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; - -#if 1 - // if constexpr(std::is_same_v) - { - constexpr index_t KPack = GetSmemPackB(); - constexpr auto BK0 = number{}; - constexpr auto DataTypeSize = sizeof(BDataType); - constexpr auto NLdsLayer = - (32 * 4 / KPerBlock / DataTypeSize) < 1 ? 1 : (32 * 4 / KPerBlock / DataTypeSize); - - constexpr auto b_lds_block_desc_0 = make_naive_tensor_descriptor( - make_tuple( - BK0 * number{}, number{}, number{}), - make_tuple(number{}, number{}, number<1>{}), - number{}, - number<1>{}); - - constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor( - b_lds_block_desc_0, - make_tuple(make_xor_transform(make_tuple(number{}, - BK0 * number{})), - make_pass_through_transform(number{})), - make_tuple(sequence<1, 0>{}, sequence<2>{}), - make_tuple(sequence<1, 0>{}, sequence<2>{})); - - constexpr auto b_lds_block_desc_bk0_nldslayer_n_bk1 = transform_tensor_descriptor( - b_lds_block_desc_permuted, - make_tuple(make_unmerge_transform(make_tuple(number{}, BK0)), - make_pass_through_transform(number{}), - make_pass_through_transform(number{})), - make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}), - make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{})); - - constexpr auto b_lds_block_desc = transform_tensor_descriptor( - b_lds_block_desc_bk0_nldslayer_n_bk1, - make_tuple(make_merge_transform_v3_division_mod( - make_tuple(number{}, number{})), - make_merge_transform_v3_division_mod(make_tuple(BK0, number{}))), - make_tuple(sequence<1, 0>{}, sequence<2, 3>{}), - make_tuple(sequence<0>{}, sequence<1>{})); - return b_lds_block_desc; - } -#else - else // B is Row Major - { - constexpr index_t BlockSize = Problem::kBlockSize; - constexpr index_t VecLoadSize = GetVectorSizeB(); - 
using TileEncodingPattern = TileDistributionEncodingPattern2D; - - constexpr auto BK0 = number{}; - constexpr auto BK1 = number{}; - // constexpr auto N0 = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I1); - constexpr auto N0 = TileEncodingPattern::X0; - constexpr auto N1 = NPerBlock / N0; - - using WarpTile = typename Problem::BlockGemmShape::WarpTile; - constexpr auto NPerXdl = number{}; - - // constexpr auto KThreadWrite = - // BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0); - constexpr auto KThreadWrite = TileEncodingPattern::Y2; - constexpr auto K0PerThreadWrite = BK0 / KThreadWrite; - constexpr auto KThreadRead = 64 / NPerXdl; - constexpr auto K0PerThreadRead = BK0 / KThreadRead; - - constexpr auto kfold = - (BK1 * N0 * sizeof(BDataType) > 128) ? 1 : 128 / (BK1 * N0 * sizeof(BDataType)); - constexpr auto KThreadReadPerm = - (kfold * K0PerThreadWrite / K0PerThreadRead) > 1 - ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead) - : KThreadRead; - - // 1<=npair<=n0 - constexpr auto npair = (BK1 * NPerXdl * sizeof(BDataType) > 128) - ? 1 - : ((128 / (BK1 * NPerXdl * sizeof(BDataType))) > N0 - ? 
N0 - : 128 / (BK1 * NPerXdl * sizeof(BDataType))); - - constexpr auto b_lds_block_desc = make_naive_tensor_descriptor_packed( - make_tuple(number{}, - number{}, - number{}, - number{}, - number{}, - BK1)); - - constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor( - b_lds_block_desc, - make_tuple( - make_pass_through_transform(number{}), - make_pass_through_transform(number{}), - make_xor_transform( - make_tuple(number{}, number{})), - make_pass_through_transform(number{}), - make_pass_through_transform(BK1)), - make_tuple( - sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{}), - make_tuple( - sequence<0>{}, sequence<1>{}, sequence<2, 3>{}, sequence<4>{}, sequence<5>{})); - - constexpr auto b_lds_block_desc_unmerged = transform_tensor_descriptor( - b_lds_block_desc_permuted, - make_tuple( - make_pass_through_transform(number{}), - make_pass_through_transform(number{}), - make_unmerge_transform(make_tuple(number{}, number{})), - make_unmerge_transform(make_tuple(number{}, number{})), - make_pass_through_transform(number{}), - make_pass_through_transform(BK1)), - make_tuple(sequence<0>{}, - sequence<1>{}, - sequence<2>{}, - sequence<3>{}, - sequence<4>{}, - sequence<5>{}), - make_tuple(sequence<1>{}, - sequence<2>{}, - sequence<0, 3>{}, - sequence<4, 5>{}, - sequence<6>{}, - sequence<7>{})); - - // constexpr auto b_lds_block_desc_bk0_n_bk1 = transform_tensor_descriptor( - // b_lds_block_desc_unmerged, - // make_tuple(make_merge_transform_v3_division_mod( - // make_tuple(number{}, - // number{}, - // number{}, - // number{})), - // make_merge_transform_v3_division_mod( - // make_tuple(number{}, number{}, number{})), - // make_pass_through_transform(BK1)), - // make_tuple(sequence<0, 1, 4, 2>{}, sequence<5, 6, 3>{}, sequence<7>{}), - // make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{})); - - constexpr auto b_lds_block_desc_kn = transform_tensor_descriptor( - b_lds_block_desc_unmerged, - 
make_tuple(make_merge_transform_v3_division_mod( - make_tuple(number{}, - number{}, - number{}, - number{}, - BK1)), - make_merge_transform_v3_division_mod( - make_tuple(number{}, number{}, number{}))), - make_tuple(sequence<0, 1, 4, 2, 7>{}, sequence<5, 6, 3>{}), - make_tuple(sequence<1>{}, sequence<0>{})); - - // return b_lds_block_desc_bk0_n_bk1; - return b_lds_block_desc_kn; - - // constexpr auto b_lds_block_desc_bk0_n_bk1 = make_naive_tensor_descriptor( - // make_tuple(BK0, number{}, number{}), - // make_tuple(number{}, number{}, number<1>{}), - // number{}, - // number<1>{}); - - // constexpr auto b_lds_block_desc = transform_tensor_descriptor( - // b_lds_block_desc_bk0_n_bk1, - // make_tuple(make_pass_through_transform(number{}), - // make_merge_transform_v3_division_mod(make_tuple(BK0, - // number{}))), - // make_tuple(sequence<1>{}, sequence<0, 2>{}), - // make_tuple(sequence<0>{}, sequence<1>{})); - - // return b_lds_block_desc; - } -#endif - } - template CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm() { From 8fcb4dff1af2c44581a01607626927dd23297163 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Tue, 29 Apr 2025 07:35:10 -0700 Subject: [PATCH 081/443] Run CI jobs as user jenkins (#2141) * run CI as jenkins * remove user jenkins from docker image * move inductor installation to a writeable path * add a switch for inductor tests --- Dockerfile | 1 - Jenkinsfile | 16 ++++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index f77c685000..3cac1dde4c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,6 @@ ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn # Add rocm repository RUN set -xe && \ - useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins && \ apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \ curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o 
/etc/apt/trusted.gpg.d/rocm-keyring.gpg diff --git a/Jenkinsfile b/Jenkinsfile index a18374509e..c46e2d53ef 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -395,7 +395,7 @@ def buildHipClangJob(Map conf=[:]){ def prefixpath = conf.get("prefixpath", "/opt/rocm") // Jenkins is complaining about the render group - def dockerOpts="-u root --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" if (conf.get("enforce_xnack_on", false)) { dockerOpts = dockerOpts + " --env HSA_XNACK=1 " } @@ -464,7 +464,7 @@ def Build_CK(Map conf=[:]){ def prefixpath = conf.get("prefixpath", "/opt/rocm") // Jenkins is complaining about the render group - def dockerOpts="-u root --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" if (conf.get("enforce_xnack_on", false)) { dockerOpts = dockerOpts + " --env HSA_XNACK=1 " } @@ -527,10 +527,10 @@ def Build_CK(Map conf=[:]){ arch_type = 6 } cmake_build(conf) - if ( !params.BUILD_LEGACY_OS && arch_type == 1 ){ + if ( params.RUN_INDUCTOR_TESTS && !params.BUILD_LEGACY_OS && arch_type == 1 ){ echo "Run inductor codegen tests" sh """ - pip install --break-system-packages --verbose . + pip install --target ${env.WORKSPACE} --break-system-packages --verbose . 
pytest python/test/test_gen_instances.py """ } @@ -625,10 +625,6 @@ def Build_CK(Map conf=[:]){ """ } } - // set ownership of all files and folders to jenkins after all steps completed - dir("build"){ - sh "sudo chown -R jenkins:jenkins ../*" - } } } } @@ -843,6 +839,10 @@ pipeline { name: "BUILD_LEGACY_OS", defaultValue: false, description: "Try building CK with legacy OS dockers: RHEL8 and SLES15 (default: OFF)") + booleanParam( + name: "RUN_INDUCTOR_TESTS", + defaultValue: false, + description: "Run inductor codegen tests (default: OFF)") } environment{ dbuser = "${dbuser}" From 6601931949dc385f78d24c4688369535d0f5315c Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Tue, 29 Apr 2025 17:22:38 -0700 Subject: [PATCH 082/443] try building ck4inductor and testing it inside a virtual environment (#2142) use system virtualenv use python-full ubuntu package in docker image --------- Co-authored-by: illsilin Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> --- Dockerfile | 4 +--- Jenkinsfile | 7 +++++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3cac1dde4c..c629bd034c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -49,9 +49,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- mpich \ net-tools \ pkg-config \ - python3 \ - python3-dev \ - python3-pip \ + python3-full \ redis \ rocm-llvm-dev \ sshpass \ diff --git a/Jenkinsfile b/Jenkinsfile index c46e2d53ef..3e22eb2f01 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -530,8 +530,11 @@ def Build_CK(Map conf=[:]){ if ( params.RUN_INDUCTOR_TESTS && !params.BUILD_LEGACY_OS && arch_type == 1 ){ echo "Run inductor codegen tests" sh """ - pip install --target ${env.WORKSPACE} --break-system-packages --verbose . - pytest python/test/test_gen_instances.py + python3 -m venv ${env.WORKSPACE} + . 
${env.WORKSPACE}/bin/activate + python3 -m pip install pytest build setuptools setuptools_scm + python3 -m pip install . + python3 -m pytest python/test/test_gen_instances.py """ } dir("build"){ From 1aea51d34eb17507b141ac9d6b36516bcc4bc584 Mon Sep 17 00:00:00 2001 From: Aviral Goel Date: Tue, 29 Apr 2025 19:37:07 -0500 Subject: [PATCH 083/443] [Tile Engine] Improved README.md (#2134) * improved tile_engine readme * changed ck tile explanation and json * further improved readme * fixed typo --- tile_engine/ops/gemm/README.md | 58 ++++++++++++++++--- .../gemm/configs/instance_combination.json | 6 +- 2 files changed, 52 insertions(+), 12 deletions(-) diff --git a/tile_engine/ops/gemm/README.md b/tile_engine/ops/gemm/README.md index 08456a1675..f7d86e90fe 100644 --- a/tile_engine/ops/gemm/README.md +++ b/tile_engine/ops/gemm/README.md @@ -1,22 +1,30 @@ # GEMM Matrix Multiplication -Use the files in this folder to generate and build applications that run Matrix multiplications using ck_tile programming based on the kernel parameters mentioned in the config file `./configs/instance_combination.json`. +CK Tile Engine GEMM is used to generate and run GEMM kernels with different combinations of BlockTile sizes, WarpTile sizes, WarpTile mapping for all valid pipelines, schedulers and epilogues. # Kernel Configurations -User needs to provide kernel configuration such as datatype, layout, tile size, warp size, padding, pipeline, scheduler and epilogue in the config file. For reference please see `./configs/instance_combination.json` +Kernel parameters are specified in the `instance_combination.json` file, including matrix layouts, data types, padding settings, pipelines, schedulers, epilogues, and numerical values for tile and warp sizes. 
-## Build -``` -# in the root of ck_tile +Given a valid set of values, tile_engine_gemm will automatically iterate over all possible combinations of BlockTile and WarpTile sizes, as well as the specified pipelines, schedulers, and epilogues from `./configs/instance_combination.json`, and build the corresponding kernels. + + +## Build Instructions +``` bash +# in the root of composable kernel create build directory mkdir build && cd build -# you can replace with the appropriate architecture (for example gfx90a or gfx942) or leave it blank -sh ../script/cmake-ck-dev.sh ../ -# To generate the executable +# build composable kernel +sh ../script/cmake-ck-dev.sh ../ # replace with the appropriate architecture (example gfx942) or leave blank +# generate the executable make tile_engine_gemm -j ``` `tile_engine_gemm` will be located in the `./bin/` directory. +_`tile_engine_gemm` must be rebuilt everytime `instance_combination.json` is modified._ +``` bash +rm -rf tile_engine/ && make tile_engine_gemm -j # rebuild +``` + ## tile_engine_gemm inputs ``` @@ -42,11 +50,43 @@ make tile_engine_gemm -j Note: pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be one of the options specified in instance_combination.json ``` +Note: In `./configs/instance_combination.json` pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be from one of the values specified above. ## Example -Below example will run gemm kernel with default dimensions of matrices, for compv3 pipeline, intrawave scheduler and default epilogue with all possible tile sizes mentioned in Config file. +The following JSON file specifies parameters used to generate and build GEMM kernels across all possible combinations of pipelines, schedulers, epilogues with different tile and warp sizes. 
+```json +{ + /// other parameters /// + + "tile_m": { + "values": [256] + }, + "tile_n": { + "values": [256] + }, + "tile_k": { + "values": [64, 32] + }, + + /// other parameters /// + + "pipeline": { + "values": ["compv3", "compv4", "mem"] + }, + "scheduler": { + "values": ["intrawave", "interwave"] + }, + "epilogue": { + "values": ["default", "cshuffle"] + } +} ``` + +At runtime, a specific subset of the generated kernels can be selected using command-line arguments. +``` bash ./bin/tile_engine_gemm -pipeline=compv3 -scheduler=intrawave -epilogue=default ``` +The above command runs kernels configured with the compv3 pipeline, intrawave scheduler, and default epilogue, while sweeping over different BlockTile sizes, WarpTile sizes, and WarpTile mappings. + diff --git a/tile_engine/ops/gemm/configs/instance_combination.json b/tile_engine/ops/gemm/configs/instance_combination.json index e23df11500..66dbdafa11 100644 --- a/tile_engine/ops/gemm/configs/instance_combination.json +++ b/tile_engine/ops/gemm/configs/instance_combination.json @@ -7,10 +7,10 @@ "values": ["c"] }, "layout_c": { - "values": ["r"] + "values": ["r"] }, "datatype": { - "values": ["fp16"] + "values": ["fp16"] }, "tile_m": { "values": [256] @@ -49,7 +49,7 @@ "values": [false] }, "pipeline": { - "values": ["compv3", "mem"] + "values": ["compv3", "compv4", "mem"] }, "scheduler": { "values": ["intrawave", "interwave"] From 23de234dbeb23f9f304b33e6b2de91639da62941 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Wed, 30 Apr 2025 09:49:37 +0200 Subject: [PATCH 084/443] Add grouped conv fwd 16x16 mfma instruction instances (#2140) * Add grouped conv fwd 16x16 mfma instruction instances * fix * remove oddc * fix * fix --- ...ice_grouped_conv_fwd_xdl_comp_instance.hpp | 23 +++- .../gpu/grouped_convolution_forward.hpp | 8 -- .../gpu/grouped_convolution_forward_wmma.inc | 111 ------------------ .../gpu/grouped_conv2d_fwd/CMakeLists.txt | 4 - 
...ma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp | 40 ------- ...mma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp | 40 ------- ...ma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp | 40 ------- ...mma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp | 40 ------- ...hwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp | 9 -- ...l_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp | 8 -- ...c_gkyxc_nhwgk_bf16_comp_part2_instance.cpp | 9 -- ...nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp | 9 -- ...dl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp | 8 -- ...gc_gkyxc_nhwgk_f16_comp_part2_instance.cpp | 9 -- ...dl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp | 10 +- ...l_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp | 28 +---- ...wd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp | 10 +- ...fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 10 +- ...fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 10 +- ...wd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 10 +- ...fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 10 +- ...fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp | 10 +- ...wd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp | 10 +- ...gc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp | 11 +- ...gc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp | 11 +- ...wgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp | 11 +- ...wgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp | 11 +- ...wgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp | 11 +- ...wgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp | 11 +- ...gc_gkyxc_nhwgk_int8_mem_inter_instance.cpp | 11 +- ...gc_gkyxc_nhwgk_int8_mem_intra_instance.cpp | 11 +- .../gpu/grouped_conv3d_fwd/CMakeLists.txt | 4 - ...gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp | 41 ------- ..._gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp | 41 ------- ...ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp | 41 ------- ..._ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp | 41 ------- 36 files changed, 36 insertions(+), 686 deletions(-) delete mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp delete mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp index f491474d38..6c0ba2f932 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp @@ -1,9 +1,10 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" @@ -89,7 +90,12 @@ using device_grouped_conv_fwd_xdl_bf16_comp_instances = std::tuple< DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + // mfma 16x16 + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, DsLayout, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 
64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, DsLayout, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, DsLayout, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, DsLayout, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8> // clang-format on >; @@ -140,7 +146,12 @@ using device_grouped_conv_fwd_xdl_f16_comp_instances = std::tuple< //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | 
PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + // mfma 16x16 + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, 
PassThrough, ConvSpec, GemmMNKPadding, 1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8> // clang-format on >; @@ -184,7 +195,11 @@ using device_grouped_conv_fwd_xdl_f32_comp_instances = std::tuple< DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + // mfma 16x16 + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, DsLayout, F32, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding,1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, DsLayout, F32, PassThrough, PassThrough, 
PassThrough, ConvSpec, GemmMNKPadding,1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, DsLayout, F32, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding,1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 4>, 4> // clang-format on >; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp index 638a3f98a3..d5eed7592e 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp @@ -599,7 +599,6 @@ struct DeviceOperationInstanceFactory>>& instances); -void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instances( - std::vector>>& instances); - void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instances( std::vector>>& instances); -void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instances( - std::vector>>& instances); - void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instances( std::vector>>& instances); -void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instances( - std::vector>>& instances); - void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instances( std::vector>>& instances); -void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instances( - std::vector>>& instances); #endif #ifdef CK_ENABLE_FP16 @@ -291,20 +236,6 @@ void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instances( PassThrough, PassThrough>>>& instances); -void 
add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instances( - std::vector>>& instances); - void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instances( std::vector>>& instances); -void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instances( - std::vector>>& instances); - void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instances( std::vector>>& instances); -void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instances( - std::vector>>& instances); - void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances( std::vector>>& instances); - -void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instances( - std::vector>>& instances); #endif } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt index c1790901ec..3a101baac0 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt @@ -87,8 +87,6 @@ add_instance_library(device_grouped_conv2d_fwd_instance wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1p0_instance.cpp wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp - wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp - wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp ## NHWGC, GKYXC, NHWGK wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp @@ -96,6 +94,4 @@ add_instance_library(device_grouped_conv2d_fwd_instance wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instance.cpp wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp 
wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp - wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp - wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp deleted file mode 100644 index a8f723dfec..0000000000 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { -// Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k] -void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instances( - std::vector>>& instances) -{ - add_device_operation_instances(instances, - device_grouped_conv_fwd_wmma_f16_instances<2, - GNHWC, - GKYXC, - Empty_Tuple, - GNHWK, - Empty_Tuple, - PassThrough, - ConvFwdOddC>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp deleted file mode 100644 index 784a118897..0000000000 --- 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { -// Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k] -void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instances( - std::vector>>& instances) -{ - add_device_operation_instances(instances, - device_grouped_conv_fwd_wmma_i8_instances<2, - GNHWC, - GKYXC, - Empty_Tuple, - GNHWK, - Empty_Tuple, - PassThrough, - ConvFwdOddC>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp deleted file mode 100644 index 8c621543a9..0000000000 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { -// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] -void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instances( - std::vector>>& instances) -{ - add_device_operation_instances(instances, - device_grouped_conv_fwd_wmma_f16_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - Empty_Tuple, - PassThrough, - ConvFwdOddC>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp deleted file mode 100644 index 5cb313b3ca..0000000000 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { -// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] -void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instances( - std::vector>>& instances) -{ - add_device_operation_instances(instances, - device_grouped_conv_fwd_wmma_i8_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - Empty_Tuple, - PassThrough, - ConvFwdOddC>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp index c078f8ed04..f5df7278d0 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp @@ -52,15 +52,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); } } diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp index a67b11f1cf..db048679bd 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp @@ -49,14 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_bf16_comp_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp index 5c0391a25f..ee9507a80a 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp @@ -52,15 +52,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instanc Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); } } diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp index 726276c461..132d3c8411 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp @@ -52,15 +52,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); } } diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp index 8b7bdec2a8..a7deb969ba 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp @@ -49,14 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_f16_comp_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp index c66114b9a3..d2732547fa 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp @@ -52,15 +52,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); } } diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp index 93e07e08fb..8a0caebc9f 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" @@ -48,14 +48,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_f32_comp_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp index 6acbb7475c..e45df1e107 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" @@ -50,14 +50,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instances( NHWGK, ConvFwd1x1S1P0>{}); - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_int8_comp_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); - if(ck::get_device_name() != "gfx950") { add_device_operation_instances( @@ -86,15 +78,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_int8_comp_instances_part2<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); } if(ck::get_device_name() == "gfx950") @@ -125,15 +108,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); - - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_int8_comp_instances_2x<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); } } diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp index 2afbfdc386..078221f89f 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. 
All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" @@ -46,14 +46,6 @@ void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances( Empty_Tuple, GNHWK, ConvFwd1x1S1P0>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_bf16_instances<2, - GNHWC, - GKYXC, - Empty_Tuple, - GNHWK, - ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp index 822ef51e00..3a481dd204 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" @@ -46,14 +46,6 @@ void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances( Empty_Tuple, GNHWK, ConvFwd1x1S1P0>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_f16_instances<2, - GNHWC, - GKYXC, - Empty_Tuple, - GNHWK, - ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp index 79a1fb99a8..5add0f8add 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" @@ -46,14 +46,6 @@ void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances( Empty_Tuple, GNHWK, ConvFwd1x1S1P0>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_f32_instances<2, - GNHWC, - GKYXC, - Empty_Tuple, - GNHWK, - ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp index e567c0df75..0257c7d315 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" @@ -46,14 +46,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_bf16_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp index 3e42184996..2715506fe2 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" @@ -46,14 +46,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_f16_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp index c035d4c3da..8d3e4d91b1 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" @@ -46,14 +46,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_f32_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp index 5c425effd8..465fa927a5 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" @@ -46,14 +46,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_int8_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp index e8a763c527..87423801cb 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" @@ -49,15 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance NHWGK, ConvFwd1x1S1P0, Interwave>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_bf16_mem_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC, - Interwave>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp index 3ae3fb5186..ebb213461a 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" @@ -49,15 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance NHWGK, ConvFwd1x1S1P0, Intrawave>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_bf16_mem_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC, - Intrawave>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp index cb7e912936..c2c8a099b2 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" @@ -49,15 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances NHWGK, ConvFwd1x1S1P0, Interwave>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_f16_mem_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC, - Interwave>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp index d787f4b048..11cb853f0d 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" @@ -49,15 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances NHWGK, ConvFwd1x1S1P0, Intrawave>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_f16_mem_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC, - Intrawave>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp index 5644289790..1992d7f7c1 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" @@ -49,15 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances NHWGK, ConvFwd1x1S1P0, Interwave>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_f32_mem_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC, - Interwave>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp index 5b12dad5a3..2b8fd3d9db 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" @@ -49,15 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances NHWGK, ConvFwd1x1S1P0, Intrawave>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_f32_mem_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC, - Intrawave>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp index f667481fa4..5579ec62cc 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" @@ -49,15 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance NHWGK, ConvFwd1x1S1P0, Interwave>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_int8_mem_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC, - Interwave>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp index 2ff2c7f51f..77f3df2c11 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" @@ -49,15 +49,6 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance NHWGK, ConvFwd1x1S1P0, Intrawave>{}); - - add_device_operation_instances(instances, - device_grouped_conv_fwd_xdl_int8_mem_instances<2, - NHWGC, - GKYXC, - Empty_Tuple, - NHWGK, - ConvFwdOddC, - Intrawave>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt index 7b9ccf6609..eeea4aae6d 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt @@ -60,10 +60,6 @@ set(GROUPED_CONV3D_FWD wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp - wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp - wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp - wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp - wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp ) if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp deleted file mode 100644 index fa378af1ee..0000000000 --- 
a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { -// Compilation parameters for in[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = out[g, n, do, ho, -// wo, k] -void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instances( - std::vector>>& instances) -{ - add_device_operation_instances(instances, - device_grouped_conv_fwd_wmma_f16_instances<3, - GNDHWC, - GKZYXC, - Empty_Tuple, - GNDHWK, - Empty_Tuple, - PassThrough, - ConvFwdOddC>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp deleted file mode 100644 index d41416fd4a..0000000000 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { -// Compilation parameters for in[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = out[g, n, do, ho, -// wo, k] -void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instances( - std::vector>>& instances) -{ - add_device_operation_instances(instances, - device_grouped_conv_fwd_wmma_i8_instances<3, - GNDHWC, - GKZYXC, - Empty_Tuple, - GNDHWK, - Empty_Tuple, - PassThrough, - ConvFwdOddC>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp deleted file mode 100644 index 8a7bc26178..0000000000 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { -// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, -// g, k] -void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instances( - std::vector>>& instances) -{ - add_device_operation_instances(instances, - device_grouped_conv_fwd_wmma_f16_instances<3, - NDHWGC, - GKZYXC, - Empty_Tuple, - NDHWGK, - Empty_Tuple, - PassThrough, - ConvFwdOddC>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp deleted file mode 100644 index 7649f86971..0000000000 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { -// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, -// g, k] -void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instances( - std::vector>>& instances) -{ - add_device_operation_instances(instances, - device_grouped_conv_fwd_wmma_i8_instances<3, - NDHWGC, - GKZYXC, - Empty_Tuple, - NDHWGK, - Empty_Tuple, - PassThrough, - ConvFwdOddC>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck From 9a9f59ae69a619e2d6ce3c8ff343f3c4b0ada413 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 30 Apr 2025 10:20:16 -0700 Subject: [PATCH 085/443] Revert "Add ck tile examples to package (#1880)" (#2150) --- example/CMakeLists.txt | 4 +--- example/ck_tile/01_fmha/CMakeLists.txt | 6 ++---- example/ck_tile/02_layernorm2d/CMakeLists.txt | 3 +-- example/ck_tile/03_gemm/CMakeLists.txt | 7 ++----- example/ck_tile/03_gemm/stript.sh | 1 - example/ck_tile/04_img2col/CMakeLists.txt | 3 +-- example/ck_tile/05_reduce/CMakeLists.txt | 4 +--- example/ck_tile/06_permute/CMakeLists.txt | 3 +-- .../ck_tile/09_topk_softmax/CMakeLists.txt | 5 ++--- example/ck_tile/10_rmsnorm2d/CMakeLists.txt | 6 ++---- .../11_add_rmsnorm2d_rdquant/CMakeLists.txt | 6 ++---- .../add_rmsnorm2d_rdquant_fwd.cpp | 21 ++++++++----------- .../example_add_rmsnorm2d_rdquant_fwd.cpp | 21 ++++++++----------- example/ck_tile/12_smoothquant/CMakeLists.txt | 3 +-- example/ck_tile/13_moe_sorting/CMakeLists.txt | 3 +-- .../ck_tile/14_moe_smoothquant/CMakeLists.txt | 3 +-- example/ck_tile/15_fused_moe/CMakeLists.txt | 3 +-- .../ck_tile/16_batched_gemm/CMakeLists.txt | 3 +-- 
.../ck_tile/17_grouped_gemm/CMakeLists.txt | 4 ++-- example/ck_tile/18_flatmm/CMakeLists.txt | 4 +--- .../35_batched_transpose/CMakeLists.txt | 4 ++-- example/ck_tile/CMakeLists.txt | 5 +---- .../flatmm_pipeline_agmem_bgmem_creg_v1.hpp | 1 - .../gemm/pipeline/gemm_pipeline_problem.hpp | 3 ++- .../ops/gemm/pipeline/tile_gemm_traits.hpp | 5 ++--- 25 files changed, 48 insertions(+), 83 deletions(-) delete mode 100644 example/ck_tile/03_gemm/stript.sh mode change 100755 => 100644 example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 0e61fd33ef..996a543ecc 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -5,6 +5,7 @@ include_directories(BEFORE add_custom_target(examples) + # list of examples that are labelled as REGRESSION_EXAMPLE for make regression (runtime more than 30 seconds) # all other tests are labelled as SMOKE_EXAMPLE set(REGRESSION_EXAMPLES @@ -231,9 +232,6 @@ endfunction(add_example_executable_no_testing EXAMPLE_NAME) # add all example subdir file(GLOB dir_list LIST_DIRECTORIES true *) -if (NOT SUPPORTED_GPU_TARGETS MATCHES "gfx9") - list(FILTER dir_list EXCLUDE REGEX ".*/ck_tile") -endif() FOREACH(subdir ${dir_list}) if(IS_DIRECTORY "${subdir}" AND EXISTS "${subdir}/CMakeLists.txt") add_subdirectory(${subdir}) diff --git a/example/ck_tile/01_fmha/CMakeLists.txt b/example/ck_tile/01_fmha/CMakeLists.txt index ce3c8b3978..9ba3a453fc 100644 --- a/example/ck_tile/01_fmha/CMakeLists.txt +++ b/example/ck_tile/01_fmha/CMakeLists.txt @@ -58,8 +58,7 @@ set(EXAMPLE_FMHA_FWD "tile_example_fmha_fwd") # not using add_example_executable() to add this target, since we don't want this to have # to be included in "make all/install/check" message("adding example ${EXAMPLE_FMHA_FWD}") -add_executable(${EXAMPLE_FMHA_FWD} fmha_fwd.cpp) -rocm_install(TARGETS ${EXAMPLE_FMHA_FWD} COMPONENT examples) +add_executable(${EXAMPLE_FMHA_FWD} EXCLUDE_FROM_ALL fmha_fwd.cpp) 
target_include_directories(${EXAMPLE_FMHA_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${EXAMPLE_FMHA_FWD} PRIVATE ${FMHA_FWD_GEN_BLOBS}) @@ -67,8 +66,7 @@ set(EXAMPLE_FMHA_BWD "tile_example_fmha_bwd") # not using add_example_executable() to add this target, since we don't want this to have # to be included in "make all/install/check" message("adding example ${EXAMPLE_FMHA_BWD}") -add_executable(${EXAMPLE_FMHA_BWD} fmha_bwd.cpp) -rocm_install(TARGETS ${EXAMPLE_FMHA_BWD} COMPONENT examples) +add_executable(${EXAMPLE_FMHA_BWD} EXCLUDE_FROM_ALL fmha_bwd.cpp) target_include_directories(${EXAMPLE_FMHA_BWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${EXAMPLE_FMHA_BWD} PRIVATE ${FMHA_BWD_GEN_BLOBS}) diff --git a/example/ck_tile/02_layernorm2d/CMakeLists.txt b/example/ck_tile/02_layernorm2d/CMakeLists.txt index 74f195a9db..fa69ac0f7a 100644 --- a/example/ck_tile/02_layernorm2d/CMakeLists.txt +++ b/example/ck_tile/02_layernorm2d/CMakeLists.txt @@ -26,8 +26,7 @@ add_custom_command( set(EXAMPLE_LAYERNORM2D_FWD "tile_example_layernorm2d_fwd") message("adding example ${EXAMPLE_LAYERNORM2D_FWD}") -add_executable(${EXAMPLE_LAYERNORM2D_FWD} layernorm2d_fwd.cpp) -rocm_install(TARGETS ${EXAMPLE_LAYERNORM2D_FWD} COMPONENT examples) +add_executable(${EXAMPLE_LAYERNORM2D_FWD} EXCLUDE_FROM_ALL layernorm2d_fwd.cpp) target_include_directories(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_GEN_BLOBS}) diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt index deccb71d23..411db2e317 100644 --- a/example/ck_tile/03_gemm/CMakeLists.txt +++ b/example/ck_tile/03_gemm/CMakeLists.txt @@ -1,8 +1,5 @@ -add_executable(tile_example_gemm_basic gemm_basic.cpp) -rocm_install(TARGETS tile_example_gemm_basic COMPONENT examples) -add_executable(tile_example_gemm_universal universal_gemm.cpp) -rocm_install(TARGETS tile_example_gemm_universal COMPONENT examples) - 
+add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp) +add_executable(tile_example_gemm_universal EXCLUDE_FROM_ALL universal_gemm.cpp) set(EXAMPLE_GEMM_COMPILE_OPTIONS) if(CK_USE_OCP_FP8) list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8) diff --git a/example/ck_tile/03_gemm/stript.sh b/example/ck_tile/03_gemm/stript.sh deleted file mode 100644 index 4b91cb36ce..0000000000 --- a/example/ck_tile/03_gemm/stript.sh +++ /dev/null @@ -1 +0,0 @@ -for file in gemm_universal_*; do mv "$file" "${file/f16_f16_f16/fp16_fp16_fp16}"; done diff --git a/example/ck_tile/04_img2col/CMakeLists.txt b/example/ck_tile/04_img2col/CMakeLists.txt index d3737467d8..3864c9ed9d 100644 --- a/example/ck_tile/04_img2col/CMakeLists.txt +++ b/example/ck_tile/04_img2col/CMakeLists.txt @@ -1,4 +1,3 @@ # not using add_example_executable() to add this target, since we don't want this to have # to be included in "make all/install/check" -add_executable(tile_example_img2col image_to_column.cpp) -rocm_install(TARGETS tile_example_img2col COMPONENT examples) +add_executable(tile_example_img2col EXCLUDE_FROM_ALL image_to_column.cpp) diff --git a/example/ck_tile/05_reduce/CMakeLists.txt b/example/ck_tile/05_reduce/CMakeLists.txt index 855e59c48e..6caa38d50d 100644 --- a/example/ck_tile/05_reduce/CMakeLists.txt +++ b/example/ck_tile/05_reduce/CMakeLists.txt @@ -3,9 +3,7 @@ set(EXAMPLE_REDUCE "tile_example_reduce") # to be included in "make all/install/check" message("adding example ${EXAMPLE_REDUCE}") -add_executable(${EXAMPLE_REDUCE} reduce.cpp) -rocm_install(TARGETS ${EXAMPLE_REDUCE} COMPONENT examples) - +add_executable(${EXAMPLE_REDUCE} EXCLUDE_FROM_ALL reduce.cpp) target_include_directories(${EXAMPLE_REDUCE} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) set(EXAMPLE_REDUCE_COMPILE_OPTIONS) diff --git a/example/ck_tile/06_permute/CMakeLists.txt b/example/ck_tile/06_permute/CMakeLists.txt index 22483a4295..327fceb685 100644 --- a/example/ck_tile/06_permute/CMakeLists.txt +++ 
b/example/ck_tile/06_permute/CMakeLists.txt @@ -1,7 +1,6 @@ # not using add_example_executable() to add this target, since we don't want this to have # to be included in "make all/install/check" -add_executable(tile_example_permute permute.cpp) -rocm_install(TARGETS tile_example_permute COMPONENT examples) +add_executable(tile_example_permute EXCLUDE_FROM_ALL permute.cpp) if(NOT DEFINED PERMUTE_USE_ALTERNATIVE_IMPL) # set(PERMUTE_USE_ALTERNATIVE_IMPL false) diff --git a/example/ck_tile/09_topk_softmax/CMakeLists.txt b/example/ck_tile/09_topk_softmax/CMakeLists.txt index fc2a4d3fe0..b43b989792 100644 --- a/example/ck_tile/09_topk_softmax/CMakeLists.txt +++ b/example/ck_tile/09_topk_softmax/CMakeLists.txt @@ -1,7 +1,6 @@ -add_executable(tile_example_topk_softmax topk_softmax.cpp topk_softmax_api.cpp) -rocm_install(TARGETS tile_example_topk_softmax COMPONENT examples) - +add_executable(tile_example_topk_softmax EXCLUDE_FROM_ALL topk_softmax.cpp topk_softmax_api.cpp) target_include_directories(tile_example_topk_softmax PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/) + set(EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS) # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations list(APPEND EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) diff --git a/example/ck_tile/10_rmsnorm2d/CMakeLists.txt b/example/ck_tile/10_rmsnorm2d/CMakeLists.txt index 731ff639a4..5684c9b2e0 100644 --- a/example/ck_tile/10_rmsnorm2d/CMakeLists.txt +++ b/example/ck_tile/10_rmsnorm2d/CMakeLists.txt @@ -26,8 +26,7 @@ add_custom_command( set(TILE_RMSNORM2D_FWD "tile_rmsnorm2d_fwd") message("adding ${TILE_RMSNORM2D_FWD}") -add_executable(${TILE_RMSNORM2D_FWD} rmsnorm2d_fwd.cpp) -rocm_install(TARGETS ${TILE_RMSNORM2D_FWD} COMPONENT examples) +add_executable(${TILE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL rmsnorm2d_fwd.cpp) target_include_directories(${TILE_RMSNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) 
target_sources(${TILE_RMSNORM2D_FWD} PRIVATE ${RMSNORM2D_FWD_GEN_BLOBS}) @@ -39,8 +38,7 @@ list(APPEND TILE_RMSNORM2D_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno target_compile_options(${TILE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS}) set(EXAMPLE_RMSNORM2D_FWD "tile_example_rmsnorm2d_fwd") -add_executable(${EXAMPLE_RMSNORM2D_FWD} example_rmsnorm2d_fwd.cpp) -rocm_install(TARGETS ${EXAMPLE_RMSNORM2D_FWD} COMPONENT examples) +add_executable(${EXAMPLE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL example_rmsnorm2d_fwd.cpp) target_compile_options(${EXAMPLE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS}) # TODO: we have to turn off this global prop, otherwise the progress bar generated diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt b/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt index 7071127e01..6b0c3cef7a 100644 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt @@ -3,8 +3,7 @@ set(TILE_ADD_RMSNORM2D_RDQUANT_FWD "tile_add_rmsnorm2d_rdquant_fwd") # to be included in "make all/install/check" message("adding ${TILE_ADD_RMSNORM2D_RDQUANT_FWD}") file(GLOB INSTANCE_SRCS instances/*.cpp) -add_executable(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} add_rmsnorm2d_rdquant_fwd.cpp) -rocm_install(TARGETS ${TILE_ADD_RMSNORM2D_RDQUANT_FWD} COMPONENT examples) +add_executable(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL add_rmsnorm2d_rdquant_fwd.cpp) target_include_directories(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${INSTANCE_SRCS}) @@ -16,8 +15,7 @@ list(APPEND TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS -Wno-undefined-func-t target_compile_options(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS}) set(EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD "tile_example_add_rmsnorm2d_rdquant_fwd") 
-add_executable(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} example_add_rmsnorm2d_rdquant_fwd.cpp) -rocm_install(TARGETS ${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} COMPONENT examples) +add_executable(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL example_add_rmsnorm2d_rdquant_fwd.cpp) target_compile_options(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS}) # TODO: we have to turn off this global prop, otherwise the progress bar generated diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp index 7d82a16aa9..574edf64d3 100644 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp @@ -67,14 +67,13 @@ bool run(const ck_tile::ArgParser& arg_parser) using TypeConfig = AddRmsnormRdquantTypeConfig; - using ADataType = typename TypeConfig::ADataType; - using BDataType = typename TypeConfig::BDataType; - using GammaDataType = typename TypeConfig::GammaDataType; - using XDataType = typename TypeConfig::XDataType; - using UnquantYDataType = ck_tile::null_type; - using YScaleDataType = typename TypeConfig::YScaleDataType; - using QYDataType = typename TypeConfig::QYDataType; - using ComputeDataType = float; + using ADataType = typename TypeConfig::ADataType; + using BDataType = typename TypeConfig::BDataType; + using GammaDataType = typename TypeConfig::GammaDataType; + using XDataType = typename TypeConfig::XDataType; + using YScaleDataType = typename TypeConfig::YScaleDataType; + using QYDataType = typename TypeConfig::QYDataType; + using ComputeDataType = float; // host verify ck_tile::HostTensor a_host({m, n}, {stride, 1}); @@ -89,7 +88,6 @@ bool run(const ck_tile::ArgParser& arg_parser) ck_tile::HostTensor qy_host_ref({m, n}, {stride, 1}); ck_tile::HostTensor qy_host_dev({m, n}, {stride, 1}); - ck_tile::HostTensor 
unquant_y_host_ref({m, n}, {stride, 1}); ck_tile::FillUniformDistribution{-.5f, .5f}(a_host); ck_tile::FillUniformDistribution{-.5f, .5f}(b_host); @@ -193,9 +191,8 @@ bool run(const ck_tile::ArgParser& arg_parser) GammaDataType, ComputeDataType, YDataType, - InvRmsDataType, - UnquantYDataType>( - x_host_ref, gamma_host, y_host, invRms_host_ref, unquant_y_host_ref, epsilon); + InvRmsDataType>( + x_host_ref, gamma_host, y_host, invRms_host_ref, epsilon); } // yscale diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp old mode 100755 new mode 100644 index 3aab357909..ada4c6f2da --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp @@ -62,14 +62,13 @@ bool run(const ck_tile::ArgParser& arg_parser) assert(stride >= n); - using ADataType = DataType; - using BDataType = DataType; - using GammaDataType = DataType; - using XDataType = DataType; - using UnquantYDataType = ck_tile::null_type; - using YScaleDataType = float; - using QYDataType = ck_tile::int8_t; - using ComputeDataType = float; + using ADataType = DataType; + using BDataType = DataType; + using GammaDataType = DataType; + using XDataType = DataType; + using YScaleDataType = float; + using QYDataType = ck_tile::int8_t; + using ComputeDataType = float; // host verify ck_tile::HostTensor a_host({m, n}, {stride, 1}); @@ -82,7 +81,6 @@ bool run(const ck_tile::ArgParser& arg_parser) ck_tile::HostTensor yscale_host_dev({m}, {1}); ck_tile::HostTensor qy_host_ref({m, n}, {stride, 1}); ck_tile::HostTensor qy_host_dev({m, n}, {stride, 1}); - ck_tile::HostTensor unquant_y_host_ref({m, n}, {stride, 1}); ck_tile::FillUniformDistribution{-.5f, .5f}(a_host); ck_tile::FillUniformDistribution{-.5f, .5f}(b_host); @@ -195,9 +193,8 @@ bool run(const ck_tile::ArgParser& arg_parser) GammaDataType, 
ComputeDataType, YDataType, - InvRmsDataType, - UnquantYDataType>( - x_host_ref, gamma_host, y_host, invRms_host_ref, unquant_y_host_ref, epsilon); + InvRmsDataType>( + x_host_ref, gamma_host, y_host, invRms_host_ref, epsilon); } // yscale diff --git a/example/ck_tile/12_smoothquant/CMakeLists.txt b/example/ck_tile/12_smoothquant/CMakeLists.txt index daeeb827bd..3849833aca 100644 --- a/example/ck_tile/12_smoothquant/CMakeLists.txt +++ b/example/ck_tile/12_smoothquant/CMakeLists.txt @@ -2,8 +2,7 @@ function (add_smoothquant_example TARGET_NAME MAIN_SRC) message("adding ${TARGET_NAME}") # not using add_example_executable() to add target, since we don't want this to have # to be included in "make all/install/check" - add_executable(${TARGET_NAME} ${MAIN_SRC}) - rocm_install(TARGETS ${TARGET_NAME} COMPONENT examples) + add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC}) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) foreach(source IN LISTS ARGN) diff --git a/example/ck_tile/13_moe_sorting/CMakeLists.txt b/example/ck_tile/13_moe_sorting/CMakeLists.txt index 662e16f0d3..09f3e4ac4e 100644 --- a/example/ck_tile/13_moe_sorting/CMakeLists.txt +++ b/example/ck_tile/13_moe_sorting/CMakeLists.txt @@ -1,5 +1,4 @@ -add_executable(tile_example_moe_sorting moe_sorting.cpp moe_sorting_api.cpp) -rocm_install(TARGETS tile_example_moe_sorting COMPONENT examples) +add_executable(tile_example_moe_sorting EXCLUDE_FROM_ALL moe_sorting.cpp moe_sorting_api.cpp) target_include_directories(tile_example_moe_sorting PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/) set(EXAMPLE_MOE_SORTING_COMPILE_OPTIONS) diff --git a/example/ck_tile/14_moe_smoothquant/CMakeLists.txt b/example/ck_tile/14_moe_smoothquant/CMakeLists.txt index 9acb27552a..12224a39a2 100644 --- a/example/ck_tile/14_moe_smoothquant/CMakeLists.txt +++ b/example/ck_tile/14_moe_smoothquant/CMakeLists.txt @@ -2,8 +2,7 @@ function (add_moe_smoothquant_example TARGET_NAME MAIN_SRC) message("adding ${TARGET_NAME}") 
# not using add_example_executable() to add target, since we don't want this to have # to be included in "make all/install/check" - add_executable(${TARGET_NAME} ${MAIN_SRC}) - rocm_install(TARGETS ${TARGET_NAME} COMPONENT examples) + add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC}) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) foreach(source IN LISTS ARGN) diff --git a/example/ck_tile/15_fused_moe/CMakeLists.txt b/example/ck_tile/15_fused_moe/CMakeLists.txt index bb25a55c7d..a716eef19e 100644 --- a/example/ck_tile/15_fused_moe/CMakeLists.txt +++ b/example/ck_tile/15_fused_moe/CMakeLists.txt @@ -3,8 +3,7 @@ set(TILE_EXAPMLE_FUSED_MOE "tile_example_fused_moe") # to be included in "make all/install/check" message("adding ${TILE_EXAPMLE_FUSED_MOE}") file(GLOB INSTANCE_SRCS instances/*.cpp) -add_executable(${TILE_EXAPMLE_FUSED_MOE} main.cpp) -rocm_install(TARGETS ${TILE_EXAPMLE_FUSED_MOE} COMPONENT examples) +add_executable(${TILE_EXAPMLE_FUSED_MOE} EXCLUDE_FROM_ALL main.cpp) target_include_directories(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${INSTANCE_SRCS}) diff --git a/example/ck_tile/16_batched_gemm/CMakeLists.txt b/example/ck_tile/16_batched_gemm/CMakeLists.txt index 9eb7a45d80..78e78c6b04 100644 --- a/example/ck_tile/16_batched_gemm/CMakeLists.txt +++ b/example/ck_tile/16_batched_gemm/CMakeLists.txt @@ -1,2 +1 @@ -add_executable(tile_example_batched_gemm batched_gemm.cpp) -rocm_install(TARGETS tile_example_batched_gemm COMPONENT examples) +add_executable(tile_example_batched_gemm EXCLUDE_FROM_ALL batched_gemm.cpp) diff --git a/example/ck_tile/17_grouped_gemm/CMakeLists.txt b/example/ck_tile/17_grouped_gemm/CMakeLists.txt index 80d688125b..d34013dd6c 100644 --- a/example/ck_tile/17_grouped_gemm/CMakeLists.txt +++ b/example/ck_tile/17_grouped_gemm/CMakeLists.txt @@ -1,2 +1,2 @@ -add_executable(tile_example_grouped_gemm grouped_gemm.cpp) 
-rocm_install(TARGETS tile_example_grouped_gemm COMPONENT examples) +add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp) + diff --git a/example/ck_tile/18_flatmm/CMakeLists.txt b/example/ck_tile/18_flatmm/CMakeLists.txt index 3a70f0447d..9fbe65e3a7 100644 --- a/example/ck_tile/18_flatmm/CMakeLists.txt +++ b/example/ck_tile/18_flatmm/CMakeLists.txt @@ -1,6 +1,4 @@ -add_executable(tile_example_flatmm_basic flatmm_basic.cpp) -rocm_install(TARGETS tile_example_flatmm_basic COMPONENT examples) - +add_executable(tile_example_flatmm_basic EXCLUDE_FROM_ALL flatmm_basic.cpp) set(EXAMPLE_FLATMM_COMPILE_OPTIONS) # list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) diff --git a/example/ck_tile/35_batched_transpose/CMakeLists.txt b/example/ck_tile/35_batched_transpose/CMakeLists.txt index 10101e4d2e..a08fcebb74 100644 --- a/example/ck_tile/35_batched_transpose/CMakeLists.txt +++ b/example/ck_tile/35_batched_transpose/CMakeLists.txt @@ -1,9 +1,9 @@ set(TARGET_NAME tile_example_batched_transpose) -add_executable(${TARGET_NAME} batched_transpose_example.cpp batched_transpose_api.cpp) -rocm_install(TARGETS ${TARGET_NAME} COMPONENT examples) +add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL batched_transpose_example.cpp batched_transpose_api.cpp) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/) # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) # list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker) target_compile_options(tile_example_batched_transpose PRIVATE ${EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS}) + diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index 16f68c6255..88efe0d8d9 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt 
@@ -14,11 +14,8 @@ add_subdirectory(11_add_rmsnorm2d_rdquant) add_subdirectory(12_smoothquant) add_subdirectory(13_moe_sorting) add_subdirectory(14_moe_smoothquant) +add_subdirectory(15_fused_moe) add_subdirectory(16_batched_gemm) add_subdirectory(17_grouped_gemm) add_subdirectory(18_flatmm) add_subdirectory(35_batched_transpose) - -if (SUPPORTED_GPU_TARGETS MATCHES "gfx94") - add_subdirectory(15_fused_moe) -endif() diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp index ad6641bc13..611aff318f 100644 --- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp @@ -6,7 +6,6 @@ #include "ck_tile/core.hpp" #include "ck_tile/host/concat.hpp" #include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp" -#include "ck_tile/host/concat.hpp" namespace ck_tile { diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp index 893c9d1ad3..0b38e7789e 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp @@ -30,7 +30,8 @@ struct GemmPipelineProblemBase using BLayout = remove_cvref_t; using CLayout = remove_cvref_t; - static constexpr bool TransposeC = Traits::TransposeC; + static constexpr bool TransposeC = Traits::TransposeC; + static constexpr bool UseStructuredSparsity = Traits::UseStructuredSparsity; static constexpr index_t kBlockSize = BlockGemmShape::NumWarps * get_warp_size(); diff --git a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp index ecf861e4e8..a31004b425 100644 --- a/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp +++ b/include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp @@ -12,8 +12,7 @@ 
template + typename CLayout_> struct TileGemmTraits { static constexpr bool kPadM = kPadM_; @@ -28,7 +27,7 @@ struct TileGemmTraits using CLayout = CLayout_; static constexpr bool TransposeC = false; - static constexpr bool UseStructuredSparsity = UseStructuredSparsity_; + static constexpr bool UseStructuredSparsity = false; }; template Date: Wed, 30 Apr 2025 17:58:40 -0400 Subject: [PATCH 086/443] updated Doxyfile and added the class list (#2147) * updated Doxyfile and added the class list * Update Doxyfile --- docs/doxygen/Doxyfile | 15 +++---- docs/index.rst | 6 +-- .../Composable-Kernel-API-reference.rst | 42 ------------------- docs/sphinx/_toc.yml.in | 6 +-- 4 files changed, 14 insertions(+), 55 deletions(-) delete mode 100644 docs/reference/Composable-Kernel-API-reference.rst diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index d6f38e0ca9..4367aabc95 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -42,19 +42,19 @@ DOXYFILE_ENCODING = UTF-8 # title of most generated pages and in a few other places. # The default value is: My Project. -PROJECT_NAME = "ck" +PROJECT_NAME = "Composable Kernel" # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = v3.0.1.0 +PROJECT_NUMBER = # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. -PROJECT_BRIEF = "prototype interfaces compatible with ROCm platform and HIP" +PROJECT_BRIEF = "Prototype interfaces compatible with ROCm platform and HiP" # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. 
The maximum height of the logo should not exceed 55 @@ -949,8 +949,8 @@ INPUT = ../../include/ck/tensor_operation/gpu/grid \ ../../include/ck/tensor_operation/gpu/block \ ../../include/ck/tensor_operation/gpu/thread \ ../../library/include/ck/library/utility \ - ../../include/ck/wrapper - + ../../include/ck/wrapper \ + ../../include/ck_tile # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -1161,7 +1161,8 @@ FILTER_SOURCE_PATTERNS = # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. -USE_MDFILE_AS_MAINPAGE = ../../README.md + +USE_MDFILE_AS_MAINPAGE = # The Fortran standard specifies that for fixed formatted Fortran code all # characters from position 72 are to be considered as comment. A common @@ -1370,7 +1371,7 @@ HTML_EXTRA_STYLESHEET = ../_doxygen/extra_stylesheet.css # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = ../_doxygen/extra_stylesheet.css # The HTML_COLORSTYLE tag can be used to specify if the generated HTML output # should be rendered with a dark or light theme. 
diff --git a/docs/index.rst b/docs/index.rst index 6d46eb49b1..4cc26a1d3e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -35,9 +35,9 @@ The Composable Kernel repository is located at `https://github.com/ROCm/composab * :doc:`Composable Kernel supported scalar types <./reference/Composable_Kernel_supported_scalar_types>` * :doc:`Composable Kernel custom types <./reference/Composable_Kernel_custom_types>` * :doc:`Composable Kernel vector utilities <./reference/Composable_Kernel_vector_utilities>` - * :ref:`api-reference` - * :ref:`wrapper` - + * :ref:`wrapper` + * :doc:`Composable Kernel complete class list <./doxygen/html/annotated>` + To contribute to the documentation refer to `Contributing to ROCm `_. You can find licensing information on the `Licensing `_ page. diff --git a/docs/reference/Composable-Kernel-API-reference.rst b/docs/reference/Composable-Kernel-API-reference.rst deleted file mode 100644 index b6ee9f7790..0000000000 --- a/docs/reference/Composable-Kernel-API-reference.rst +++ /dev/null @@ -1,42 +0,0 @@ -.. meta:: - :description: Composable Kernel documentation and API reference library - :keywords: composable kernel, CK, ROCm, API, documentation - -.. _api-reference: - -******************************************************************** -Composable Kernel API reference guide -******************************************************************** - -This document contains details of the APIs for the Composable Kernel library and introduces some of the key design principles that are used to write new classes that extend the functionality of the Composable Kernel library. - -================= -DeviceMem -================= - -.. doxygenstruct:: DeviceMem - -============================= -Kernels For Flashattention -============================= - -The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This section lists -the classes that are used in the CK GPU implementation of Flashattention. - -**Gridwise classes** - -.. 
doxygenstruct:: ck::GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle - -**Blockwise classes** - -.. doxygenstruct:: ck::ThreadGroupTensorSliceTransfer_v4r1 - -.. doxygenstruct:: ck::BlockwiseGemmXdlops_v2 - -.. doxygenstruct:: ck::BlockwiseSoftmax - -**Threadwise classes** - -.. doxygenstruct:: ck::ThreadwiseTensorSliceTransfer_StaticToStatic - -.. bibliography:: diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index df98998224..2ef3383d84 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -32,10 +32,10 @@ subtrees: title: Composable Kernel custom types - file: reference/Composable_Kernel_vector_utilities.rst title: Composable Kernel vector utilities - - file: reference/Composable-Kernel-API-reference.rst - title: Composable Kernel API reference - file: reference/Composable-Kernel-wrapper.rst - title: Composable Kernel Wrapper + title: Composable Kernel wrapper + - file: doxygen/html/annotated.rst + title: Composable Kernel class list - caption: About entries: From 1d8ef407604882b03857ba75d71be29ccd0ed592 Mon Sep 17 00:00:00 2001 From: Aviral Goel Date: Wed, 30 Apr 2025 18:43:36 -0500 Subject: [PATCH 087/443] Add documentation for ck_tile::array (#2078) * addded documentation for ck_tile::array * clang format fix * spelling errros Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> * spelling errros Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: spolifroni-amd * Revert "spelling errros" This reverts commit 4179e7d193e27b0b0b500ad50a87ae9f8dba8334. * Revert "spelling errros" This reverts commit 3f90733dbe27dffb9cb113a007059cf149cafb48. 
--------- Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> Co-authored-by: spolifroni-amd Co-authored-by: John Afaganis --- include/ck_tile/core/container/array.hpp | 27 ++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/include/ck_tile/core/container/array.hpp b/include/ck_tile/core/container/array.hpp index fa63597db4..94aa40e278 100644 --- a/include/ck_tile/core/container/array.hpp +++ b/include/ck_tile/core/container/array.hpp @@ -19,6 +19,25 @@ namespace ck_tile { // array buf {3, 2}; => {3, 2, 2, 2} (not {3,2,0,0}) // use make_array_with({...}) to construct an array with compatible behavior as old ck // TODO: manually added constructor same as old ck +/** + * @brief A fixed-size array container similar to std::array with additional utilities. + * + * This template class provides a lightweight fixed-size array with value semantics, + * supporting both host and device functionality for GPU programming. It includes + * specialized initialization methods and type punning capabilities. + * + * @tparam T_ The type of elements in the array + * @tparam N_ The fixed number of elements in the array + * + * @note This implementation provides additional features beyond std::array: + * - GPU compatibility via CK_TILE_HOST_DEVICE macros + * - Type punning via get_as() and set_as() methods + * - Various specialized access methods + * - Specialized initialization behaviors + * + * The initializer_list constructor fills remaining elements with the last value + * provided if the list size is smaller than N, which is different than std::array. + */ template struct array { @@ -142,6 +161,14 @@ struct array // empty Array +/// @brief Specialization of array container for zero elements. +/// +/// This is a specialization of the array container template for the case where the number of +/// elements is 0. It provides the same interface as the general array template, but with operations +/// appropriate for an empty array. 
+/// +/// @tparam T The type of elements stored in the array (not used in this specialization but +/// maintained for API consistency). template struct array { From b9d17bdb115c034e9a1028b3adca63762784d9b2 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 1 May 2025 07:04:57 -0700 Subject: [PATCH 088/443] add write permissions in workspace (#2154) --- Jenkinsfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Jenkinsfile b/Jenkinsfile index 3e22eb2f01..68999d8aa6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -76,6 +76,7 @@ def check_host() { if ("${env.CK_SCCACHE}" != "null"){ def SCCACHE_SERVER="${env.CK_SCCACHE.split(':')[0]}" echo "sccache server: ${SCCACHE_SERVER}" + sh "chmod +w -R ${env.WORKSPACE}" sh '''ping -c 1 -p 6379 "${SCCACHE_SERVER}" | echo $? > tmp.txt''' def output = readFile(file: "tmp.txt") echo "tmp.txt contents: \$output" From 79b0bfeb41db45de0cb65fdf24d27201ea0ae0e6 Mon Sep 17 00:00:00 2001 From: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com> Date: Thu, 1 May 2025 11:55:48 -0600 Subject: [PATCH 089/443] MX GEMM - Add FP8 GEMM Tests for Different Layouts (#2152) * Add gemm_mx_fp8_bf8 example with row-major B * Add more overloads of MX MFMA instructions * Add MK_KN (RRR) tests * Add KM_NK (CCR) tests * Add more problem sizes to Large tests * Add test_gemm_mx to the list of regression tests --- example/67_gemm_microscaling/CMakeLists.txt | 3 + .../67_gemm_microscaling/gemm_mx_common.hpp | 2 +- .../67_gemm_microscaling/gemm_mx_fp8_bf8.cpp | 97 ++++++++++ .../grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp | 14 +- .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 18 ++ include/ck/utility/amd_xdlops.hpp | 101 +++++++++- .../tensor_operation_instance/gpu/gemm_mx.hpp | 50 ++++- .../gpu/gemm_mx/CMakeLists.txt | 6 +- ...device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn.hpp | 61 ++++++ ...l_bf8_f8_f16_mk_kn_mn_default_instance.cpp | 32 ++++ ...device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn.hpp | 62 ++++++ 
...l_f8_f8_bf16_km_nk_mn_default_instance.cpp | 32 ++++ test/CMakeLists.txt | 1 + test/gemm_mx/test_gemm_mx.cpp | 179 +++++++++++++++++- test/gemm_mx/test_gemm_mx_util.hpp | 2 +- 15 files changed, 642 insertions(+), 18 deletions(-) create mode 100644 example/67_gemm_microscaling/gemm_mx_fp8_bf8.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_bf8_f8_f16/device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_bf8_f8_f16/device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn.hpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn_default_instance.cpp diff --git a/example/67_gemm_microscaling/CMakeLists.txt b/example/67_gemm_microscaling/CMakeLists.txt index 34125465a9..1a1db51c37 100644 --- a/example/67_gemm_microscaling/CMakeLists.txt +++ b/example/67_gemm_microscaling/CMakeLists.txt @@ -6,3 +6,6 @@ add_example_dependencies(example_gemm_mx example_gemm_mx_fp8) add_example_executable(example_gemm_mx_bf8 gemm_mx_bf8.cpp) add_example_dependencies(example_gemm_mx example_gemm_mx_bf8) +add_example_executable(example_gemm_mx_fp8_bf8 gemm_mx_fp8_bf8.cpp) +add_example_dependencies(example_gemm_mx example_gemm_mx_fp8_bf8) + diff --git a/example/67_gemm_microscaling/gemm_mx_common.hpp b/example/67_gemm_microscaling/gemm_mx_common.hpp index 32ef975192..99ed2a23b9 100644 --- a/example/67_gemm_microscaling/gemm_mx_common.hpp +++ b/example/67_gemm_microscaling/gemm_mx_common.hpp @@ -235,7 +235,7 @@ bool run_mx_gemm(const ProblemSizeSplitK& problem_size, const ExecutionConfig& c break; case 2: - a_m_k.GenerateTensorValue(GeneratorTensor_3{-2.0, 2.0}); + a_m_k.GenerateTensorValue(GeneratorTensor_3{-2.0, 2.0}); 
a_m_k_scale.GenerateTensorValue(GeneratorTensor_3{powf(2.0f, -125.0f), 1.0f}); b_k_n.GenerateTensorValue(GeneratorTensor_3{-2.0, 2.0}); diff --git a/example/67_gemm_microscaling/gemm_mx_fp8_bf8.cpp b/example/67_gemm_microscaling/gemm_mx_fp8_bf8.cpp new file mode 100644 index 0000000000..ce4ebc0a40 --- /dev/null +++ b/example/67_gemm_microscaling/gemm_mx_fp8_bf8.cpp @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "gemm_mx_common.hpp" + +using ADataType = ck::f8_t; +using BDataType = ck::bf8_t; + +using XDataType = ck::e8m0_bexp_t; + +using CDataType = ck::bhalf_t; +using AccDataType = float; +using CShuffleDataType = CDataType; + +using ALayout = Row; +using BLayout = Row; +using CLayout = Row; + +using AElementOp = PassThrough; // elementwise transformation for A matrix +using BElementOp = PassThrough; // elementwise transformation for B matrix +using CElementOp = PassThrough; // elementwise transformation for C matrix + +constexpr ck::index_t ScaleBlockSize = 32; // scaling block size + +constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; +constexpr auto BlkGemmPSched = ck::BlockGemmPipelineScheduler::Intrawave; +constexpr auto BlkGemmPVer = ck::BlockGemmPipelineVersion::v1; + +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMX_Xdl_CShuffleV3< + ALayout, // ALayout + BLayout, // BLayout + CLayout, // CLayout + ADataType, // ADataType + XDataType, // AScaleDataType + BDataType, // BDataType + XDataType, // BScaleDataType + CDataType, // CDataType + AccDataType, // GemmAccDataType + CShuffleDataType, // CShuffleDataType + AElementOp, // AElementwiseOperation + BElementOp, // BElementwiseOperation + CElementOp, // CElementwiseOperation + GemmSpec, // GemmSpec + ScaleBlockSize, // ScaleBlockSize: Scaling block size + 256, // BlockSize: Thread block size + 256, // MPerBlock + 256, // NPerBlock + 128, // KPerBlock + 16, // AK1 + 8, // 
BK1 + 16, // MPerXDL + 16, // NPerXDL + 8, // MXdlPerWave + 8, // NXdlPerWave + S<8, 32, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 16, // ABlockTransferSrcScalarPerVector + 16, // ABlockTransferDstScalarPerVector_AK1 + false, // ABlockLdsExtraM + S<16, 16, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<0, 2, 1>, // BBlockTransferThreadClusterArrangeOrder + S<0, 2, 1>, // BBlockTransferSrcAccessOrder + 1, // BBlockTransferSrcVectorDim + 16, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_BK1 + false, // BBlockLdsExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8, // CShuffleBlockTransferScalarPerVector_NPerBlock + BlkGemmPSched, // BlkGemmPipeSched + BlkGemmPVer, // BlkGemmPipelineVer + ADataType, // ComputeTypeA + BDataType // ComputeTypeB + >; + +int main(int argc, char* argv[]) +{ + return run_mx_gemm_example(argc, argv) + ? 0 + : -1; +} diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp index 44d515e76c..1154fa2aa3 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp @@ -797,12 +797,13 @@ struct GridwiseGemmMX_xdl_cshuffle_v3 // kfold and mpair dimension is not always required. // more dimension in merge_transform increase the difficulty of generating immarg offset // for compiler. 
- constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1); - constexpr auto M1 = MPerBlock / M0; + constexpr auto WaveSize = 64; + constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1); + constexpr auto M1 = MPerBlock / M0; constexpr auto KThreadWrite = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0); constexpr auto K0PerThreadWrite = AK0Number / KThreadWrite; - constexpr auto KThreadRead = BlockwiseGemmPipe::WaveSize / MPerXdl; + constexpr auto KThreadRead = WaveSize / MPerXdl; constexpr auto K0PerThreadRead = AK0Number / KThreadRead; constexpr auto kfold = (AK1Number * M0 * sizeof(ADataType) > 128) @@ -929,12 +930,13 @@ struct GridwiseGemmMX_xdl_cshuffle_v3 } else // RowMajor B { - constexpr auto N0 = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I1); - constexpr auto N1 = NPerBlock / N0; + constexpr auto WaveSize = 64; + constexpr auto N0 = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I1); + constexpr auto N1 = NPerBlock / N0; constexpr auto KThreadWrite = BBlockTransferThreadClusterLengths_BK0_N_BK1{}.At(I0); constexpr auto K0PerThreadWrite = BK0Number / KThreadWrite; - constexpr auto KThreadRead = BlockwiseGemmPipe::WaveSize / NPerXdl; + constexpr auto KThreadRead = WaveSize / NPerXdl; constexpr auto K0PerThreadRead = BK0Number / KThreadRead; constexpr auto kfold = (BK1Number * N0 * sizeof(BDataType) > 128) diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp index 08c4e4ba6e..06268f3cfb 100644 --- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp @@ -1129,6 +1129,12 @@ struct MfmaSelector return MfmaInstr::mfma_scale_f32_32x32x64f8f6f4; } + template <> + constexpr auto GetMfma() + { + return MfmaInstr::mfma_scale_f32_32x32x64f8f6f4; + } + template <> constexpr auto GetMfma() { @@ -1147,6 +1153,18 @@ struct MfmaSelector return MfmaInstr::mfma_scale_f32_16x16x128f8f6f4; } + 
template <> + constexpr auto GetMfma() + { + return MfmaInstr::mfma_scale_f32_16x16x128f8f6f4; + } + + template <> + constexpr auto GetMfma() + { + return MfmaInstr::mfma_scale_f32_16x16x128f8f6f4; + } + template <> constexpr auto GetMfma() { diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index a8c3baa31b..71e1937a23 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -532,7 +532,44 @@ struct intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32> reg_a, reg_b, reg_c.template AsType()[Number<0>{}], - 0, // cbsz + 0, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 0, // blgp + 0, // OPSEL + scale_a, + 0, // OPSEL + scale_b); + // XXX: Note on the scale_a and scale_b parameters: + // If compiler detects that one or both scales are constant values, it will treat that + // constant as F32 constant. I.e., if scale_a at some point was declared as + // `e8m0_bexp_t a_scale{1.0f}`, the instruction would only work if scale_a parameter is + // assigned value `bit_cast(static_cast(a_scale))`. + + // XXX: Note on the OPSEL parameters: Instruction always takes byte0 as a scale value even + // when OPSEL is set otherwise. 
+#else + ignore = reg_a; + ignore = scale_a; + ignore = reg_b; + ignore = scale_b; + ignore = reg_c; +#endif + } + + template + __device__ static void Run(const bf8x32_t& reg_a, + const int32_t& scale_a, + const f8x32_t& reg_b, + const int32_t& scale_b, + FloatC& reg_c) + { +#if defined(__gfx950__) + // https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/llvm/test/Verifier/AMDGPU/mfma-scale.ll#L10 + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + reg_a, + reg_b, + reg_c.template AsType()[Number<0>{}], + 1, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} 0, // blgp 0, // OPSEL scale_a, @@ -576,7 +613,7 @@ struct intrin_mfma_scale_f32_16x16x128f8f6f4<16, 16> reg_a, reg_b, reg_c.template AsType()[Number<0>{}], - 0, // cbsz + 0, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} 0, // blgp 0, // OPSEL scale_a, @@ -605,7 +642,7 @@ struct intrin_mfma_scale_f32_16x16x128f8f6f4<16, 16> reg_a, reg_b, reg_c.template AsType()[Number<0>{}], - 1, // cbsz + 1, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} 1, // blgp 0, // OPSEL scale_a, @@ -617,6 +654,64 @@ struct intrin_mfma_scale_f32_16x16x128f8f6f4<16, 16> ignore = reg_b; ignore = scale_b; ignore = reg_c; +#endif + } + + template + __device__ static void Run(const f8x32_t& reg_a, + const int32_t& scale_a, + const bf8x32_t& reg_b, + const int32_t& scale_b, + FloatC& reg_c) + { +#if defined(__gfx950__) + // https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/llvm/test/Verifier/AMDGPU/mfma-scale.ll#L10 + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + reg_a, + reg_b, + reg_c.template AsType()[Number<0>{}], + 0, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 1, // blgp + 0, // OPSEL + scale_a, + 0, // OPSEL + scale_b); +#else + ignore = reg_a; + ignore = scale_a; + ignore = reg_b; + ignore = 
scale_b; + ignore = reg_c; +#endif + } + + template + __device__ static void Run(const bf8x32_t& reg_a, + const int32_t& scale_a, + const f8x32_t& reg_b, + const int32_t& scale_b, + FloatC& reg_c) + { +#if defined(__gfx950__) + // https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/llvm/test/Verifier/AMDGPU/mfma-scale.ll#L10 + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + reg_a, + reg_b, + reg_c.template AsType()[Number<0>{}], + 1, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 0, // blgp + 0, // OPSEL + scale_a, + 0, // OPSEL + scale_b); +#else + ignore = reg_a; + ignore = scale_a; + ignore = reg_b; + ignore = scale_b; + ignore = reg_c; #endif } }; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_mx.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_mx.hpp index 1c40ccec5d..4af5143f45 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_mx.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_mx.hpp @@ -45,6 +45,34 @@ void add_device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instances( PassThrough, PassThrough>>>& instances); +void add_device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn_default_instances( + std::vector>>& instances); + +void add_device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn_default_instances( + std::vector>>& instances); + template && is_same_v && + is_same_v) + { + + add_device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + + add_device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn_default_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { if constexpr(is_same_v && is_same_v && is_same_v) { - add_device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instances(op_ptrs); + 
add_device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn_default_instances(op_ptrs); } } diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_mx/CMakeLists.txt index a166fc4ce4..0442bed130 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_mx/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_mx/CMakeLists.txt @@ -1,14 +1,18 @@ # ONLY MX_KERNELS set(GEMM_MX_INSTANCES) -list(APPEND GEMM_MX_INSTANCES +list(APPEND GEMM_MX_INSTANCES device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instance.cpp device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instance.cpp + device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn_default_instance.cpp + device_gemm_mx_xdl_bf8_f8_f16/device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn_default_instance.cpp ) set_source_files_properties(device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") set_source_files_properties(device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") +set_source_files_properties(device_gemm_mx_xdl_bf8_f8_f16/device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") add_instance_library(device_gemm_mx_instance ${GEMM_MX_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_bf8_f8_f16/device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_bf8_f8_f16/device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn.hpp new file mode 100644 
index 0000000000..25dd68a207 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_bf8_f8_f16/device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn.hpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F8 = f8_t; +using BF8 = bf8_t; +using F16 = half_t; +using BF16 = bhalf_t; +using F32 = float; +using E8M0 = ck::e8m0_bexp_t; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +static constexpr auto ScaleBlockSize = 32; + +template +using device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn_instances = std::tuple< +// clang-format off + //#########################| ALayout| BLayout| CLayout|AData|AScale|BData|BScale| CData| AccData| Cshuffle| A| B| C| GEMM| Scale Block| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| 
BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Data| Type| Data| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | Type| | Type| | | | Operation| Operation| Operation| | | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +#if defined(__gfx950__) || defined(CK_USE_NATIVE_MX_SUPPORT) + DeviceGemmMX_Xdl_CShuffleV3< Row, Row, Row, BF8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 128, 64, 16, 128, 16, 4, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Row, Row, Row, BF8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 256, 256, 256, 16, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, false, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, false, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Row, Row, Row, BF8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 64, 64, 
256, 16, 4, 32, 32, 1, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Row, Row, Row, BF8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 128, 16, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Row, Row, Row, BF8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 128, 16, 32, 512, 16, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1> +#endif + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_bf8_f8_f16/device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_bf8_f8_f16/device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn_default_instance.cpp new file mode 100644 index 0000000000..2b6ccdbeda --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_bf8_f8_f16/device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn_default_instance.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn.hpp new file mode 100644 index 0000000000..0df018bf1d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn.hpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F8 = f8_t; +using F16 = half_t; +using BF16 = bhalf_t; +using F32 = float; +using E8M0 = ck::e8m0_bexp_t; + +using Row = tensor_layout::gemm::RowMajor; +using Col = tensor_layout::gemm::ColumnMajor; + +template +using S = Sequence; + +using PassThrough = element_wise::PassThrough; + +static constexpr auto GemmDefault = GemmSpecialization::Default; +static constexpr auto GemmKPadding = GemmSpecialization::KPadding; +static constexpr auto GemmMNPadding = GemmSpecialization::MNPadding; +static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; + +static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; +static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; + +static constexpr auto ScaleBlockSize = 32; + +template +using device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn_instances = std::tuple< +// clang-format off + //#########################| ALayout| BLayout| CLayout|AData|AScale|BData|BScale| CData| AccData| Cshuffle| A| B| C| GEMM| Scale Block| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //#########################| | | | Type| Data| Type| Data| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| 
ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //#########################| | | | | Type| | Type| | | | Operation| Operation| Operation| | | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +#if defined(__gfx950__) || defined(CK_USE_NATIVE_MX_SUPPORT) + DeviceGemmMX_Xdl_CShuffleV3< Col, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 128, 4, 16, 32, 32, 2, 2, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Col, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 16, 256, 128, 4, 16, 16, 16, 1, 4, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Col, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Col, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, 
PassThrough, GemmSpec, ScaleBlockSize, 64, 16, 16, 512, 8, 16, 16, 16, 1, 1, S<64, 1, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Col, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 256, 256, 128, 8, 16, 16, 16, 8, 8, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Col, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 256, 256, 64, 4, 16, 32, 32, 4, 4, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGemmMX_Xdl_CShuffleV3< Col, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 128, 128, 128, 128, 4, 16, 16, 16, 4, 8, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1> +#endif + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn_default_instance.cpp new file mode 100644 index 0000000000..c75e779fea --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn_default_instance.cpp @@ -0,0 +1,32 
@@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 72c51823be..6bde1140d9 100755 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -36,6 +36,7 @@ set(REGRESSION_TESTS test_batchnorm_bwd_rank_4 test_grouped_convnd_bwd_data_xdl test_conv_tensor_rearrange + test_gemm_mx ) function(add_test_executable TEST_NAME) diff --git a/test/gemm_mx/test_gemm_mx.cpp b/test/gemm_mx/test_gemm_mx.cpp index 6e1957e60a..2c976a217f 100644 --- a/test/gemm_mx/test_gemm_mx.cpp +++ b/test/gemm_mx/test_gemm_mx.cpp @@ -39,17 +39,49 @@ class TestGemmMX_MK_NK { }; +template +class TestGemmMX_MK_KN + : public ck::test::TestGemmMX, Tuple>::type> +{ +}; + +template +class TestGemmMX_KM_NK + : public ck::test::TestGemmMX, Tuple>::type> +{ +}; + // clang-format off -using KernelTypes_MK_NK = ::testing::Types< +using KernelTypes_F8_MK_NK = ::testing::Types< #if defined(CK_ENABLE_FP8) // ADataType, BDataType, CDataType, ScaleBlockSize std::tuple< F8, F8, F16, ck::Number<32> >, std::tuple< F8, F8, BF16, ck::Number<32> > #endif >; + +using KernelTypes_BF8_F8_MK_KN = ::testing::Types< +#if defined(CK_ENABLE_FP8) + // ADataType, BDataType, CDataType, ScaleBlockSize + std::tuple< BF8, F8, F16, ck::Number<32> > +#endif + >; + +using KernelTypes_F8_KM_NK = ::testing::Types< +#if defined(CK_ENABLE_FP8) + // ADataType, BDataType, CDataType, ScaleBlockSize + std::tuple< F8, F8, BF16, ck::Number<32> > +#endif + >; // clang-format on 
-TYPED_TEST_SUITE(TestGemmMX_MK_NK, KernelTypes_MK_NK); +TYPED_TEST_SUITE(TestGemmMX_MK_NK, KernelTypes_F8_MK_NK); +TYPED_TEST_SUITE(TestGemmMX_MK_KN, KernelTypes_BF8_F8_MK_KN); +TYPED_TEST_SUITE(TestGemmMX_KM_NK, KernelTypes_F8_KM_NK); + +/// A: RowMajor +/// B: ColMajor +/// C: RowMajor TYPED_TEST(TestGemmMX_MK_NK, SmallM) { @@ -95,14 +127,151 @@ TYPED_TEST(TestGemmMX_MK_NK, Regular) TYPED_TEST(TestGemmMX_MK_NK, Large) { - std::vector Ms{4096}; - constexpr int N = 3840; - constexpr int K = 4096; + std::vector> test_sizes{{5120, 5120}, {3840, 5120}, {4096, 4096}}; + constexpr int K = 4096; constexpr int StrideA = K; constexpr int StrideB = K; + + for(auto test_size : test_sizes) + { + auto M = test_size.first; + auto N = test_size.second; + + const auto StrideC = N; + this->Run(M, N, K, StrideA, StrideB, StrideC); + } +} + +/// A: RowMajor +/// B: RowMajor +/// C: RowMajor + +TYPED_TEST(TestGemmMX_MK_KN, SmallM) +{ + std::vector Ms{1, 2, 3, 4, 5, 6}; + constexpr int N = 256; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = N; constexpr int StrideC = N; for(int M : Ms) this->Run(M, N, K, StrideA, StrideB, StrideC); } + +TYPED_TEST(TestGemmMX_MK_KN, MidLargeM) +{ + std::vector Ms{127, 255, 312, 799, 1573}; + constexpr int N = 256; + constexpr int K = 512; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmMX_MK_KN, Regular) +{ + std::vector Ms{3840}; + constexpr int N = 512; + constexpr int K = 1024; + + constexpr int StrideA = K; + constexpr int StrideB = N; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, StrideA, StrideB, StrideC); +} + +TYPED_TEST(TestGemmMX_MK_KN, Large) +{ + std::vector> test_sizes{{5120, 5120}, {3840, 5120}, {4096, 4096}}; + + constexpr int K = 4096; + constexpr int StrideA = K; + + for(auto test_size : test_sizes) + { + auto M = test_size.first; + auto N = 
test_size.second; + + const auto StrideB = N; + const auto StrideC = N; + this->Run(M, N, K, StrideA, StrideB, StrideC); + } +} + +/// A: ColMajor +/// B: ColMajor +/// C: RowMajor + +TYPED_TEST(TestGemmMX_KM_NK, SmallN) +{ + constexpr int M = 256; + std::vector Ns{1, 2, 3, 4, 5, 6}; + constexpr int K = 512; + + constexpr int StrideA = M; + constexpr int StrideB = K; + + for(int N : Ns) + { + const auto new_N = N * 8; + const auto StrideC = new_N; + this->Run(M, new_N, K, StrideA, StrideB, StrideC); + } +} + +TYPED_TEST(TestGemmMX_KM_NK, MidLargeN) +{ + constexpr int M = 256; + std::vector Ns{127, 255, 312, 799, 1573}; + constexpr int K = 512; + + constexpr int StrideA = M; + constexpr int StrideB = K; + + for(int N : Ns) + { + const auto new_N = (N + 7) / 8 * 8; + const auto StrideC = new_N; + this->Run(M, new_N, K, StrideA, StrideB, StrideC); + } +} + +TYPED_TEST(TestGemmMX_KM_NK, Regular) +{ + std::vector Ms{3840}; + constexpr int N = 512; + constexpr int K = 1024; + + constexpr int StrideB = K; + constexpr int StrideC = N; + + for(int M : Ms) + this->Run(M, N, K, M, StrideB, StrideC); +} + +TYPED_TEST(TestGemmMX_KM_NK, Large) +{ + std::vector> test_sizes{{5120, 5120}, {3840, 5120}, {4096, 4096}}; + + constexpr int K = 4096; + constexpr int StrideB = K; + + for(auto test_size : test_sizes) + { + auto M = test_size.first; + auto N = test_size.second; + + const auto StrideA = M; + const auto StrideC = N; + this->Run(M, N, K, StrideA, StrideB, StrideC); + } +} diff --git a/test/gemm_mx/test_gemm_mx_util.hpp b/test/gemm_mx/test_gemm_mx_util.hpp index 3bca4ceded..02833daeb4 100644 --- a/test/gemm_mx/test_gemm_mx_util.hpp +++ b/test/gemm_mx/test_gemm_mx_util.hpp @@ -150,7 +150,7 @@ bool profile_gemm_mx_impl(int do_verification, break; default: - a_m_k.GenerateTensorValue(GeneratorTensor_3{-2.0, 2.0}); + a_m_k.GenerateTensorValue(GeneratorTensor_3{-2.0, 2.0}); a_m_k_scale.GenerateTensorValue( GeneratorTensor_3{powf(2.0f, -125.0f), 1.0f}); // R[2^-125, 1] From 
619fba3134641e4a08950a3bea385c16dbb74b64 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 1 May 2025 12:37:27 -0700 Subject: [PATCH 090/443] re-enable ck4inductor tests by default (#2155) --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 68999d8aa6..a9d30d9f71 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -845,8 +845,8 @@ pipeline { description: "Try building CK with legacy OS dockers: RHEL8 and SLES15 (default: OFF)") booleanParam( name: "RUN_INDUCTOR_TESTS", - defaultValue: false, - description: "Run inductor codegen tests (default: OFF)") + defaultValue: true, + description: "Run inductor codegen tests (default: ON)") } environment{ dbuser = "${dbuser}" From d58f2b8bd0c2adad65a731403673d545d8483acb Mon Sep 17 00:00:00 2001 From: Khushbu Agarwal Date: Thu, 1 May 2025 13:36:24 -0700 Subject: [PATCH 091/443] mfma_32x32x64_fp8/bf8 (#2148) * support for mfma_32x32x64_fp8 * clang-formatted * Fixing sparsity in codegen --- include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 12 +++ .../warp/warp_gemm_attribute_mfma_impl.hpp | 98 +++++++++++++++++++ .../ops/gemm/warp/warp_gemm_dispatcher.hpp | 5 + tile_engine/ops/gemm/gemm_instance_builder.py | 54 +++++----- 4 files changed, 147 insertions(+), 22 deletions(-) diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp index 22962b9404..e75aca1d91 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp @@ -228,6 +228,18 @@ using WarpGemmMfma_f32_16x16x128_bf8_fp8 = WarpGemmImpl>>; +using WarpGemmMfma_f32_32x32x64_fp8_fp8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; + +using WarpGemmMfma_f32_32x32x64_fp8_bf8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; + +using WarpGemmMfma_f32_32x32x64_bf8_fp8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; + +using WarpGemmMfma_f32_32x32x64_bf8_bf8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; + 
using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed = WarpGemmImpl>>; diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp index cd32f35180..96c3c3d29f 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp @@ -1440,6 +1440,104 @@ template using WarpGemmAttributeMfmaImpl_f32_16x16x128_bf8_bf8 = WarpGemmAttributeMfmaImpl_f32_16x16x128_f8_bf8_base; +template +struct WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base +{ + static constexpr WGAttrCtlEnum Ctrl = Ctrl_; + using ADataType = AType_; + using BDataType = BType_; + using CDataType = float; + + using AVecType = ext_vector_t; + using BVecType = ext_vector_t; + using CVecType = ext_vector_t; + + static constexpr index_t kM = 32; + static constexpr index_t kN = 32; + static constexpr index_t kK = 64; + + static constexpr index_t kAMBlock = 1; + static constexpr index_t kBNBlock = 1; + + static constexpr index_t kAMLane = 32; + static constexpr index_t kBNLane = 32; + static constexpr index_t kABKLane = 2; + static constexpr index_t kABKPerLane = 32; + + static constexpr index_t kCMLane = 2; + static constexpr index_t kCNLane = 32; + static constexpr index_t kCM0PerLane = 4; + static constexpr index_t kCM1PerLane = 4; + + // c_vec += a_vec * b_vec + template + CK_TILE_DEVICE void operator()(CVecType& c_vec, + const AVecType& a_vec, + const BVecType& b_vec, + bool_constant = {}) const + { + //__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, cbsz, blgp, opsel, scale_a, + // opsel, scale_b) +#if defined(__gfx950__) + if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + a_vec, b_vec, c_vec, 0, 0, 0, 0, 0, 0); + else if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + a_vec, b_vec, c_vec, 0, 1, 0, 0, 0, 0); + else if 
constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + a_vec, b_vec, c_vec, 1, 0, 0, 0, 0, 0); + else if constexpr(std::is_same_v && std::is_same_v) + c_vec = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + a_vec, b_vec, c_vec, 1, 1, 0, 0, 0, 0); +#else + ck_tile::ignore = c_vec; + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; +#endif + } + + // c_vec = a_vec * b_vec + CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const + { +#if defined(__gfx950__) + if constexpr(std::is_same_v && std::is_same_v) + return bit_cast(__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + a_vec, b_vec, CVecType{0.f}, 0, 0, 0, 0, 0, 0)); + else if constexpr(std::is_same_v && std::is_same_v) + return bit_cast(__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + a_vec, b_vec, CVecType{0.f}, 0, 1, 0, 0, 0, 0)); + else if constexpr(std::is_same_v && std::is_same_v) + return bit_cast(__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + a_vec, b_vec, CVecType{0.f}, 1, 0, 0, 0, 0, 0)); + else if constexpr(std::is_same_v && std::is_same_v) + return bit_cast(__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + a_vec, b_vec, CVecType{0.f}, 1, 1, 0, 0, 0, 0)); +#else + ck_tile::ignore = a_vec; + ck_tile::ignore = b_vec; + return CVecType{0.f}; +#endif + } +}; + +template +using WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8 = + WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base; + +template +using WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_bf8 = + WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base; + +template +using WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_fp8 = + WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base; + +template +using WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_bf8 = + WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base; + // int8 template struct WarpGemmAttributeMfmaImpl_i32_32x32x16_i8 diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp 
b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp index 0e3342c479..64bd61a3dc 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp @@ -74,6 +74,11 @@ template<> struct WarpGemmMfmaDispatcher struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_16x16x128_bf8_fp8; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_16x16x128_bf8_bf8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x64_fp8_fp8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x64_fp8_bf8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x64_bf8_fp8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x64_bf8_bf8; }; + // clang-format on } // namespace impl diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index b441bdd2d6..a748c35feb 100755 --- a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -282,14 +282,14 @@ class GemmCodeGenerator: def _generate_common_header(self): """Generate common header with datatypes and layout""" - ctype = self.config.datatype - atype = self.config.datatype - btype = self.config.datatype + self.ctype = self.config.datatype + self.atype = self.config.datatype + self.btype = self.config.datatype if self.config.datatype in ['fp8', 'bf8']: - ctype = 'fp16' + self.ctype = 'fp16' elif self.config.datatype in ['int4']: - atype = 'fp16' - ctype = 'fp16' + self.atype = 'fp16' + self.ctype = 'fp16' content = f"""// SPDX-License-Identifier: MIT // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
@@ -298,10 +298,10 @@ class GemmCodeGenerator: #include "ck_tile/core.hpp" // Data types -using ADataType = {DATA_TYPE_MAP[atype]}; -using BDataType = {DATA_TYPE_MAP[btype]}; +using ADataType = {DATA_TYPE_MAP[self.atype]}; +using BDataType = {DATA_TYPE_MAP[self.btype]}; using AccDataType = float; -using CDataType = {DATA_TYPE_MAP[ctype]}; +using CDataType = {DATA_TYPE_MAP[self.ctype]}; // Layout configurations using ALayout = {LAYOUT_MAP[self.config.layouts[0]]}; @@ -499,7 +499,7 @@ struct GemmDispatcher { static void init(bool structured_sparsity) { auto& kernel_map = get_kernel_map(); - if(!kernel_map.empty()) return; + if(!kernel_map.empty()) return; \n""" # Add tile/warp instantiations tile_params = set(itertools.product( @@ -516,12 +516,25 @@ struct GemmDispatcher { for group in self.all_kernels: - content += f""" kernel_map["{group}"] = [=](ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::HostTensor& c_m_n_dev_result, - int verify, ck_tile::GemmHostArgs& args, - const ck_tile::stream_config& stream) {{ - """ + content += f""" kernel_map["{group}"] = [=](ck_tile::DeviceMem& c_m_n_dev_buf, + ck_tile::HostTensor& c_m_n_host_result, + ck_tile::HostTensor& c_m_n_dev_result, + int verify, ck_tile::GemmHostArgs& args, + const ck_tile::stream_config& stream) {{ + if(structured_sparsity){{ // SMFMA""" + for tile in tile_params: + # Check if we have valid tile/warp combinations + # (tile_m/(warp_m*warp_tile_m)) * warp_m * warp_tile_m == tile_m + if ((tile[0]/(tile[3] * tile[7]) * tile[3] * tile[7]) != tile[0]) or \ + ((tile[1]/(tile[4] * tile[8]) * tile[4] * tile[8]) != tile[1]): + continue + sparse = self.atype == 'fp16' and \ + ((tile[6] == 32 and tile[7] == 32 and tile[8] == 16) or + (tile[6] == 16 and tile[7] == 16 and tile[8] == 32)) + content += f""" + run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(sparse)}>>(c_m_n_dev_buf, 
c_m_n_host_result, c_m_n_dev_result, verify, args, stream);""" + content += f""" + }} else {{""" for tile in tile_params: # Check if we have valid tile/warp combinations # (tile_m/(warp_m*warp_tile_m)) * warp_m * warp_tile_m == tile_m @@ -529,13 +542,10 @@ struct GemmDispatcher { ((tile[1]/(tile[4] * tile[8]) * tile[4] * tile[8]) != tile[1]): continue content += f""" - if(structured_sparsity) {{ - run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {1}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream); - }} else {{ - run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {0}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream); - }}""" + run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(False)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream);""" content += f""" - }};\n""" + }} + }};\n""" content += """ } From c4e4e592c13168a9cf053039a447b31714b92c55 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 May 2025 07:29:07 -0700 Subject: [PATCH 092/443] Bump rocm-docs-core[api_reference] from 1.18.2 to 1.18.4 in /docs/sphinx (#2161) Bumps [rocm-docs-core[api_reference]](https://github.com/ROCm/rocm-docs-core) from 1.18.2 to 1.18.4. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.18.2...v1.18.4) --- updated-dependencies: - dependency-name: rocm-docs-core[api_reference] dependency-version: 1.18.4 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index ac03e40939..6c48b2de09 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core[api_reference]==1.18.2 +rocm-docs-core[api_reference]==1.18.4 sphinxcontrib-bibtex==2.6.3 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 3742eeebba..62c3ea8ff8 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -237,7 +237,7 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core[api-reference]==1.18.2 +rocm-docs-core[api-reference]==1.18.4 # via -r requirements.in rpds-py==0.24.0 # via From 79beaacdd17928d77d6622498b734cd2d3d3c6d6 Mon Sep 17 00:00:00 2001 From: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com> Date: Mon, 5 May 2025 09:18:22 -0600 Subject: [PATCH 093/443] Restrict MX GEMM instantiation to GFX950 arch (#2157) --- .../device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn.hpp | 2 +- .../device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn.hpp | 2 +- .../device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp | 2 +- .../device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_bf8_f8_f16/device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_bf8_f8_f16/device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn.hpp index 25dd68a207..3713ebae0e 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_bf8_f8_f16/device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_bf8_f8_f16/device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn.hpp @@ -45,7 +45,7 
@@ using device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn_instances = std::tuple< //#########################| | | | Type| Data| Type| Data| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | Type| | Type| | | | Operation| Operation| Operation| | | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx950__) || defined(CK_USE_NATIVE_MX_SUPPORT) +#if defined(__gfx950__) DeviceGemmMX_Xdl_CShuffleV3< Row, Row, Row, BF8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 128, 64, 16, 128, 16, 4, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Row, Row, BF8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 256, 256, 256, 16, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, false, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, false, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Row, Row, BF8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 64, 64, 256, 16, 4, 32, 32, 1, 
1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn.hpp index 0df018bf1d..5b0c5137b3 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn.hpp @@ -44,7 +44,7 @@ using device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn_instances = std::tuple< //#########################| | | | Type| Data| Type| Data| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | Type| | Type| | | | Operation| Operation| Operation| | | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx950__) || defined(CK_USE_NATIVE_MX_SUPPORT) +#if defined(__gfx950__) DeviceGemmMX_Xdl_CShuffleV3< Col, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 
128, 4, 16, 32, 32, 2, 2, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Col, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 16, 256, 128, 4, 16, 16, 16, 1, 4, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Col, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp index 1e979f69ca..8e25bcc25f 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp @@ -44,7 +44,7 @@ using device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_instances = std::tuple< //#########################| | | | Type| Data| Type| Data| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| 
ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | Type| | Type| | | | Operation| Operation| Operation| | | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx950__) || defined(CK_USE_NATIVE_MX_SUPPORT) +#if defined(__gfx950__) DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 128, 128, 16, 128, 16, 16, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 256, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, false, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp index 0ca4f2a3ce..5fefb57257 100644 --- 
a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp @@ -44,7 +44,7 @@ using device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_instances = std::tuple< //#########################| | | | Type| Data| Type| Data| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | Type| | Type| | | | Operation| Operation| Operation| | | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx950__) || defined(CK_USE_NATIVE_MX_SUPPORT) +#if defined(__gfx950__) DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 128, 128, 16, 128, 16, 16, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 256, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 
16, false, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, From 0bcb804ad079f8b427786cc701675b3c535a180b Mon Sep 17 00:00:00 2001 From: jakpiase Date: Mon, 5 May 2025 18:46:44 +0200 Subject: [PATCH 094/443] [CK_TILE] Remove scratch usage from universal gemm (#2001) * moves kbatch condition outside of kernel * add reviewer comments * fixes * fix tests * fixes after review --------- Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> --- example/ck_tile/03_gemm/gemm_basic.cpp | 91 ++++++---- example/ck_tile/03_gemm/universal_gemm.cpp | 88 ++++++--- .../ck_tile/16_batched_gemm/batched_gemm.cpp | 171 ++++++++++-------- .../ck_tile/17_grouped_gemm/grouped_gemm.cpp | 171 ++++++++++-------- .../ops/epilogue/cshuffle_epilogue.hpp | 63 +++---- .../ops/gemm/kernel/batched_gemm_kernel.hpp | 10 +- .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp | 53 ++---- .../batched_gemm/test_batched_gemm_util.hpp | 34 +++- test/ck_tile/gemm/test_gemm_pipeline_util.hpp | 98 ++++++---- .../grouped_gemm/test_grouped_gemm_util.hpp | 34 +++- 10 files changed, 473 insertions(+), 340 deletions(-) diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp index 69051423fb..1edb3da947 100644 --- a/example/ck_tile/03_gemm/gemm_basic.cpp +++ b/example/ck_tile/03_gemm/gemm_basic.cpp @@ -53,50 +53,67 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& using CodegenPipelineProblem = ck_tile:: GemmPipelineProblem; using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; - using GemmEpilogue = ck_tile::CShuffleEpilogue< - 
ck_tile::CShuffleEpilogueProblem>; - // ToDo: Will add the codegen part to test different pipeline policies in GEMM. - // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. - using Kernel = ck_tile::GemmKernel; - auto kargs = Kernel::MakeKernelArgs(args); + const auto Run = [&](const auto memory_operation_) { + constexpr auto memory_operation = memory_operation_.value; - const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); - constexpr dim3 blocks = Kernel::BlockSize(); + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; - if(!Kernel::IsSupportedArgument(kargs)) + // ToDo: Will add the codegen part to test different pipeline policies in GEMM. + // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. + using Kernel = ck_tile::GemmKernel; + auto kargs = Kernel::MakeKernelArgs(args); + + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); + constexpr dim3 blocks = Kernel::BlockSize(); + + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n"); + } + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n' + << "shape: " << CodegenGemmShape::GetName() << '\n' + << "problem: " << CodegenPipelineProblem::GetName() << '\n' + << "pipeline: " << CodegenGemmPipeline::GetName() << '\n' + << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return ave_time; + }; + + if(args.k_batch == 1) { - throw std::runtime_error("Wrong! Arguments not supported! 
Skipping gemm!\n"); + return Run(ck_tile::integral_constant{}); } - - if(s.log_level_ > 0) + else { - std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n' - << "shape: " << CodegenGemmShape::GetName() << '\n' - << "problem: " << CodegenPipelineProblem::GetName() << '\n' - << "pipeline: " << CodegenGemmPipeline::GetName() << '\n' - << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" - << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" - << std::endl; + return Run(ck_tile::integral_constant{}); } - - float ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - - return ave_time; } #include "run_gemm_example.inc" diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp index 2ba16ca89d..e6a2811918 100644 --- a/example/ck_tile/03_gemm/universal_gemm.cpp +++ b/example/ck_tile/03_gemm/universal_gemm.cpp @@ -61,10 +61,13 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& float ave_time{0}; - const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) { - constexpr bool has_hot_loop_v = has_hot_loop_.value; - constexpr auto tail_number_v = tail_number_.value; - constexpr auto scheduler = GEMM_PIPELINE_SCHEDULER; + const auto Run = [&](const auto has_hot_loop_, + const auto tail_number_, + const auto memory_operation_) { + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + constexpr auto scheduler = GEMM_PIPELINE_SCHEDULER; + constexpr auto memory_operation = memory_operation_.value; using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem>; + UniversalGemmProblem::TransposeC, + memory_operation>>; using Kernel = ck_tile::GemmKernel; auto kargs = Kernel::MakeKernelArgs(args); @@ -116,23 +120,40 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& return ave_time; }; + const auto 
RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { + if(args.k_batch == 1) + { + Run(has_hot_loop_, + tail_number_, + ck_tile::integral_constant{}); + } + else + { + Run(has_hot_loop_, + tail_number_, + ck_tile::integral_constant{}); + } + }; + if(has_hot_loop) { #if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3) if(tail_num == ck_tile::TailNumber::Full) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Odd) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Even) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else { @@ -146,20 +167,21 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& // Tail pipeline One to Seven if(tail_num == ck_tile::TailNumber::One) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Full) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } if constexpr(BaseGemmPipeline::PrefetchStages > 2) { if(tail_num == ck_tile::TailNumber::Two) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } @@ -167,7 +189,8 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& { if(tail_num == ck_tile::TailNumber::Three) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } @@ -175,7 +198,8 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& { if(tail_num == 
ck_tile::TailNumber::Four) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } @@ -183,7 +207,8 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& { if(tail_num == ck_tile::TailNumber::Five) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } @@ -191,7 +216,8 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& { if(tail_num == ck_tile::TailNumber::Six) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } @@ -199,20 +225,22 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& { if(tail_num == ck_tile::TailNumber::Seven) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } #elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4) if(tail_num == ck_tile::TailNumber::Three) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } else { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } #endif } @@ -220,18 +248,18 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& { if(tail_num == ck_tile::TailNumber::Full) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Odd) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Even) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else { diff --git 
a/example/ck_tile/16_batched_gemm/batched_gemm.cpp b/example/ck_tile/16_batched_gemm/batched_gemm.cpp index a0cd18ec74..0219c67305 100644 --- a/example/ck_tile/16_batched_gemm/batched_gemm.cpp +++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp @@ -106,61 +106,81 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre float ave_time{0}; - const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) { - constexpr bool has_hot_loop_v = has_hot_loop_.value; - constexpr auto tail_number_v = tail_number_.value; - constexpr auto scheduler = GEMM_PIPELINE_SCHEDULER; + const auto Run = + [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) { + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + constexpr auto scheduler = GEMM_PIPELINE_SCHEDULER; + constexpr auto memory_operation = memory_operation_.value; - using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; + using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; - using GemmPipeline = GEMM_PIPELINE; - using GemmEpilogue = ck_tile::CShuffleEpilogue< - ck_tile::CShuffleEpilogueProblem>; - using Kernel = ck_tile::BatchedGemmKernel; - auto kargs = Kernel::MakeKernelArgs(args); + using GemmPipeline = GEMM_PIPELINE; + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; + using Kernel = ck_tile::BatchedGemmKernel; + auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count); + constexpr dim3 blocks = Kernel::BlockSize(); - if(!Kernel::IsSupportedArgument(kargs)) + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! 
Skipping gemm!\n"); + } + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n' + << "shape: " << GemmShape::GetName() << '\n' + << "problem: " << GemmPipelineProblem::GetName() << '\n' + << "pipeline: " << GemmPipeline::GetName() << '\n' + << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z + << "}" << std::endl; + } + + ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + return ave_time; + }; + + const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { + if(args.k_batch == 1) { - throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n"); + Run(has_hot_loop_, + tail_number_, + ck_tile::integral_constant{}); } - - if(s.log_level_ > 0) + else { - std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n' - << "shape: " << GemmShape::GetName() << '\n' - << "problem: " << GemmPipelineProblem::GetName() << '\n' - << "pipeline: " << GemmPipeline::GetName() << '\n' - << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" - << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" - << std::endl; + Run(has_hot_loop_, + tail_number_, + ck_tile::integral_constant{}); } - - ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - return ave_time; }; if(has_hot_loop) @@ -168,18 +188,18 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre #if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3) if(tail_num == ck_tile::TailNumber::Full) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Odd) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + 
ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Even) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else { @@ -193,20 +213,21 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre // Tail pipeline One to Seven if(tail_num == ck_tile::TailNumber::One) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Full) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } if constexpr(BaseGemmPipeline::PrefetchStages > 2) { if(tail_num == ck_tile::TailNumber::Two) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } @@ -214,7 +235,8 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre { if(tail_num == ck_tile::TailNumber::Three) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } @@ -222,7 +244,8 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre { if(tail_num == ck_tile::TailNumber::Four) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } @@ -230,7 +253,8 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre { if(tail_num == ck_tile::TailNumber::Five) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } @@ -238,7 +262,8 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre { if(tail_num == ck_tile::TailNumber::Six) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } @@ -246,20 +271,22 @@ float batched_gemm(const 
ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre { if(tail_num == ck_tile::TailNumber::Seven) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } #elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4) if(tail_num == ck_tile::TailNumber::Three) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } else { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } #endif } @@ -267,18 +294,18 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre { if(tail_num == ck_tile::TailNumber::Full) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Odd) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Even) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } std::ostringstream err; err << "Incorrect tail_num for pipeline without hotloop, expected Full, Odd or Even, but " diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp index 2a9903362d..9b134ff779 100644 --- a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp @@ -114,66 +114,86 @@ float grouped_gemm(const std::vector& gemm_descs, float ave_time{0}; - const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) { - constexpr bool has_hot_loop_v = has_hot_loop_.value; - constexpr auto tail_number_v = tail_number_.value; - constexpr auto scheduler = GEMM_PIPELINE_SCHEDULER; + const auto Run = + [&](const auto has_hot_loop_, const 
auto tail_number_, const auto memory_operation_) { + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + constexpr auto scheduler = GEMM_PIPELINE_SCHEDULER; + constexpr auto memory_operation = memory_operation_.value; - using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; + using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; - using GemmPipeline = GEMM_PIPELINE; - using GemmEpilogue = ck_tile::CShuffleEpilogue< - ck_tile::CShuffleEpilogueProblem>; - using Kernel = ck_tile::GroupedGemmKernel; - auto kargs = Kernel::MakeKargs(gemm_descs); + using GemmPipeline = GEMM_PIPELINE; + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; + using Kernel = ck_tile::GroupedGemmKernel; + auto kargs = Kernel::MakeKargs(gemm_descs); - const dim3 grids = Kernel::GridSize(gemm_descs); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 grids = Kernel::GridSize(gemm_descs); + constexpr dim3 blocks = Kernel::BlockSize(); - ck_tile::hip_check_error(hipMemcpyWithStream(p_workspace_, - kargs.data(), - get_workspace_size(gemm_descs), - hipMemcpyHostToDevice, - s.stream_id_)); + ck_tile::hip_check_error(hipMemcpyWithStream(p_workspace_, + kargs.data(), + get_workspace_size(gemm_descs), + hipMemcpyHostToDevice, + s.stream_id_)); - if(s.log_level_ > 0) + if(s.log_level_ > 0) + { + std::cout << "Launching kernel: " << Kernel::GetName() << " with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z + << "}" << std::endl; + } + + ave_time = ck_tile::launch_kernel( + s, + ck_tile::make_kernel( + Kernel{}, + grids, + blocks, + 0, + ck_tile::cast_pointer_to_constant_address_space(p_workspace_), + gemm_descs.size())); + return ave_time; + }; + + const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { + if(gemm_descs[0].k_batch == 1) { - std::cout << 
"Launching kernel: " << Kernel::GetName() << " with args:" - << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" - << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" - << std::endl; + Run(has_hot_loop_, + tail_number_, + ck_tile::integral_constant{}); + } + else + { + Run(has_hot_loop_, + tail_number_, + ck_tile::integral_constant{}); } - - ave_time = ck_tile::launch_kernel( - s, - ck_tile::make_kernel( - Kernel{}, - grids, - blocks, - 0, - ck_tile::cast_pointer_to_constant_address_space(p_workspace_), - gemm_descs.size())); - return ave_time; }; if(has_hot_loop) @@ -181,18 +201,18 @@ float grouped_gemm(const std::vector& gemm_descs, #if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3) if(tail_num == ck_tile::TailNumber::Full) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Odd) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Even) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else { @@ -206,20 +226,21 @@ float grouped_gemm(const std::vector& gemm_descs, // Tail pipeline One to Seven if(tail_num == ck_tile::TailNumber::One) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Full) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } if constexpr(BaseGemmPipeline::PrefetchStages > 2) { if(tail_num == ck_tile::TailNumber::Two) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } @@ -227,7 +248,8 
@@ float grouped_gemm(const std::vector& gemm_descs, { if(tail_num == ck_tile::TailNumber::Three) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } @@ -235,7 +257,8 @@ float grouped_gemm(const std::vector& gemm_descs, { if(tail_num == ck_tile::TailNumber::Four) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } @@ -243,7 +266,8 @@ float grouped_gemm(const std::vector& gemm_descs, { if(tail_num == ck_tile::TailNumber::Five) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } @@ -251,7 +275,8 @@ float grouped_gemm(const std::vector& gemm_descs, { if(tail_num == ck_tile::TailNumber::Six) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } @@ -259,20 +284,22 @@ float grouped_gemm(const std::vector& gemm_descs, { if(tail_num == ck_tile::TailNumber::Seven) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } } #elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4) if(tail_num == ck_tile::TailNumber::Three) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } else { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } #endif } diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp index 225997439e..9b8dde1905 100644 --- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp @@ -22,23 +22,25 @@ template + bool isCTransposed_, + memory_operation_enum MemoryOperation_> struct CShuffleEpilogueProblem { - using ADataType = remove_cvref_t; - using BDataType = remove_cvref_t; - using AccDataType = remove_cvref_t; - using ODataType = 
remove_cvref_t; - using CLayout = remove_cvref_t; - static constexpr index_t kBlockSize = kBlockSize_; - static constexpr index_t kMPerBlock = kM_; - static constexpr index_t kNPerBlock = kN_; - static constexpr index_t kMWave = kMWave_; - static constexpr index_t kNWave = kNWave_; - static constexpr index_t kMPerXdl = kMPerXdl_; - static constexpr index_t kNPerXdl = kNPerXdl_; - static constexpr index_t kKPerXdl = kKPerXdl_; - static constexpr index_t isCTransposed = isCTransposed_; + using ADataType = remove_cvref_t; + using BDataType = remove_cvref_t; + using AccDataType = remove_cvref_t; + using ODataType = remove_cvref_t; + using CLayout = remove_cvref_t; + static constexpr index_t kBlockSize = kBlockSize_; + static constexpr index_t kMPerBlock = kM_; + static constexpr index_t kNPerBlock = kN_; + static constexpr index_t kMWave = kMWave_; + static constexpr index_t kNWave = kNWave_; + static constexpr index_t kMPerXdl = kMPerXdl_; + static constexpr index_t kNPerXdl = kNPerXdl_; + static constexpr index_t kKPerXdl = kKPerXdl_; + static constexpr index_t isCTransposed = isCTransposed_; + static constexpr memory_operation_enum MemoryOperation = MemoryOperation_; }; template @@ -52,18 +54,19 @@ struct CShuffleEpilogue // Used for weight-only quantization kernel, B would be dequantized to the same data type as A using BTypeToUse = std::conditional_t, ADataType, BDataType>; - using CLayout = remove_cvref_t; - static constexpr index_t kBlockSize = Problem::kBlockSize; - static constexpr index_t kMPerBlock = Problem::kMPerBlock; - static constexpr index_t kNPerBlock = Problem::kNPerBlock; - static constexpr index_t kMWave = Problem::kMWave; - static constexpr index_t kNWave = Problem::kNWave; - static constexpr index_t kMPerXdl = Problem::kMPerXdl; - static constexpr index_t kNPerXdl = Problem::kNPerXdl; - static constexpr index_t kKPerXdl = Problem::kKPerXdl; - static constexpr index_t isCTransposed = Problem::isCTransposed; - static constexpr index_t 
kMPerIteration = kMPerXdl * kMWave; - static constexpr index_t kNPerIteration = kNPerXdl * kNWave; + using CLayout = remove_cvref_t; + static constexpr memory_operation_enum MemoryOperation = Problem::MemoryOperation; + static constexpr index_t kBlockSize = Problem::kBlockSize; + static constexpr index_t kMPerBlock = Problem::kMPerBlock; + static constexpr index_t kNPerBlock = Problem::kNPerBlock; + static constexpr index_t kMWave = Problem::kMWave; + static constexpr index_t kNWave = Problem::kNWave; + static constexpr index_t kMPerXdl = Problem::kMPerXdl; + static constexpr index_t kNPerXdl = Problem::kNPerXdl; + static constexpr index_t kKPerXdl = Problem::kKPerXdl; + static constexpr index_t isCTransposed = Problem::isCTransposed; + static constexpr index_t kMPerIteration = kMPerXdl * kMWave; + static constexpr index_t kNPerIteration = kNPerXdl * kNWave; using WG = WarpGemmMfmaDispatcher + template CK_TILE_DEVICE auto operator()(ODramWindow& out_dram_window, const OAccTile& o_acc_tile, void* p_smem) { @@ -179,7 +180,7 @@ struct CShuffleEpilogue const auto c_out_tensor = load_tile(make_tile_window(out_lds_window, dram_tile_distribution)); - if constexpr(out_memory_data_op == memory_operation_enum::set) + if constexpr(MemoryOperation == memory_operation_enum::set) { store_tile(out_dram_window, c_out_tensor); } diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp index dfb6bfae58..d495c0d950 100644 --- a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp @@ -142,15 +142,7 @@ struct BatchedGemmKernel : public GemmKernelRunGemm(a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n); - } - else - { - this->template RunGemm( - a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n); - } + this->RunGemm(a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n); } }; diff --git 
a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp index bc41f680f2..9c25104cd7 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp @@ -608,9 +608,7 @@ struct GemmKernel * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup. * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup. * - * @tparam DstInMemOp Destination memory operation (default: set). */ - template CK_TILE_DEVICE static void RunGemm(const ADataType* a_ptr, const BDataType* b_ptr, CDataType* c_ptr, @@ -622,7 +620,8 @@ struct GemmKernel { // Create Gemm tensor views, pad views and tile windows const auto& gemm_tensor_views_tuple = - MakeGemmTensorViews(a_ptr, b_ptr, c_ptr, kargs, splitk_batch_offset); + MakeGemmTensorViews( + a_ptr, b_ptr, c_ptr, kargs, splitk_batch_offset); const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); @@ -640,9 +639,8 @@ struct GemmKernel // Run Epilogue Pipeline auto& c_block_window = gemm_tile_windows.at(I2); - EpiloguePipeline{} - .template operator()( - c_block_window, c_block_tile, smem_ptr_0); + EpiloguePipeline{}.template operator()( + c_block_window, c_block_tile, smem_ptr_0); } /** @@ -660,9 +658,7 @@ struct GemmKernel * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup. * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup. * - * @tparam DstInMemOp Destination memory operation (default: set). 
*/ - template CK_TILE_DEVICE static void RunGemm2LDS(const ADataType* a_ptr, const BDataType* b_ptr, CDataType* c_ptr, @@ -675,7 +671,8 @@ struct GemmKernel { // Create Gemm tensor views, pad views and tile windows const auto& gemm_tensor_views_tuple = - MakeGemmTensorViews(a_ptr, b_ptr, c_ptr, kargs, splitk_batch_offset); + MakeGemmTensorViews( + a_ptr, b_ptr, c_ptr, kargs, splitk_batch_offset); const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); @@ -692,9 +689,8 @@ struct GemmKernel // Run Epilogue Pipeline auto& c_block_window = gemm_tile_windows.at(I2); - EpiloguePipeline{} - .template operator()( - c_block_window, c_block_tile, smem_ptr_0); + EpiloguePipeline{}.template operator()( + c_block_window, c_block_tile, smem_ptr_0); } CK_TILE_DEVICE void operator()(GemmKernelArgs kargs) const @@ -718,7 +714,9 @@ struct GemmKernel if constexpr(GemmPipeline::DoubleSmemBuffer == true) { __shared__ char smem_ptr_1[GetSmemSize()]; - if(kargs.k_batch == 1) + if constexpr(!(EpiloguePipeline::MemoryOperation == memory_operation_enum::atomic_add && + EpiloguePipeline::GetVectorSizeC() % 2 != 0 && + is_any_of::value)) { RunGemm2LDS(a_ptr, b_ptr, @@ -730,38 +728,15 @@ struct GemmKernel i_m, i_n); } - else - { - if constexpr(!(EpiloguePipeline::GetVectorSizeC() % 2 != 0 && - is_any_of::value)) - { - RunGemm2LDS(a_ptr, - b_ptr, - c_ptr, - smem_ptr_0, - smem_ptr_1, - kargs, - splitk_batch_offset, - i_m, - i_n); - } - } } else { - if(kargs.k_batch == 1) + if constexpr(!(EpiloguePipeline::MemoryOperation == memory_operation_enum::atomic_add && + EpiloguePipeline::GetVectorSizeC() % 2 != 0 && + is_any_of::value)) { RunGemm(a_ptr, b_ptr, c_ptr, smem_ptr_0, kargs, splitk_batch_offset, i_m, i_n); } - else - { - if constexpr(!(EpiloguePipeline::GetVectorSizeC() % 2 != 0 && - is_any_of::value)) - { - RunGemm( - a_ptr, b_ptr, c_ptr, smem_ptr_0, kargs, splitk_batch_offset, i_m, i_n); 
- } - } } } }; diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp index 0af3ef3b34..4633f23ded 100644 --- a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp +++ b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp @@ -81,10 +81,13 @@ class TestCkTileBatchedGemm : public ::testing::Test float ave_time{0}; - const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) { - constexpr bool has_hot_loop_v = has_hot_loop_.value; - constexpr auto tail_number_v = tail_number_.value; - constexpr auto scheduler = ck_tile::GemmPipelineScheduler::Intrawave; + const auto Run = [&](const auto has_hot_loop_, + const auto tail_number_, + const auto memory_operation_) { + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + constexpr auto scheduler = ck_tile::GemmPipelineScheduler::Intrawave; + constexpr auto memory_operation = memory_operation_.value; using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem>; + UniversalGemmProblem::TransposeC, + memory_operation>>; using Kernel = ck_tile::BatchedGemmKernel; auto kargs = Kernel::MakeKernelArgs(args); @@ -138,11 +142,29 @@ class TestCkTileBatchedGemm : public ::testing::Test return ave_time; }; + const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { + if(args.k_batch == 1) + { + Run(has_hot_loop_, + tail_number_, + ck_tile::integral_constant{}); + } + else + { + Run(has_hot_loop_, + tail_number_, + ck_tile::integral_constant{}); + } + }; + if(has_hot_loop) { if(tail_num == ck_tile::TailNumber::Full) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } else diff --git a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp index 1b997ddbce..0329f16416 100644 --- a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp +++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp @@ 
-138,9 +138,12 @@ class TestCkTileGemmPipeline : public ::testing::Test const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); - const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) { - constexpr bool has_hot_loop_v = has_hot_loop_.value; - constexpr auto tail_number_v = tail_number_.value; + const auto Run = [&](const auto has_hot_loop_, + const auto tail_number_, + const auto memory_operation_) { + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + constexpr auto memory_operation = memory_operation_.value; using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem>; + UniversalGemmProblem::TransposeC, + memory_operation>>; using Kernel = ck_tile::GemmKernel; auto kargs = Kernel::MakeKernelArgs(args); @@ -193,15 +197,32 @@ class TestCkTileGemmPipeline : public ::testing::Test s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); }; + const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { + if(args.k_batch == 1) + { + Run(has_hot_loop_, + tail_number_, + ck_tile::integral_constant{}); + } + else + { + Run(has_hot_loop_, + tail_number_, + ck_tile::integral_constant{}); + } + }; + if(has_hot_loop) { if constexpr(PipelineType == GemmPipelineType::CompV3) { if(tail_num == ck_tile::TailNumber::Full) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else { @@ -219,69 +240,69 @@ class TestCkTileGemmPipeline : public ::testing::Test // Tail pipeline One to Seven if(tail_num == ck_tile::TailNumber::One) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Full) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + 
RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } if constexpr(BaseGemmPipeline::PrefetchStages > 2) { if(tail_num == ck_tile::TailNumber::Two) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } } if constexpr(BaseGemmPipeline::PrefetchStages > 3) { if(tail_num == ck_tile::TailNumber::Three) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } } if constexpr(BaseGemmPipeline::PrefetchStages > 4) { if(tail_num == ck_tile::TailNumber::Four) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } } if constexpr(BaseGemmPipeline::PrefetchStages > 5) { if(tail_num == ck_tile::TailNumber::Five) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } } if constexpr(BaseGemmPipeline::PrefetchStages > 6) { if(tail_num == ck_tile::TailNumber::Six) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } } if constexpr(BaseGemmPipeline::PrefetchStages > 7) { if(tail_num == ck_tile::TailNumber::Seven) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } } } @@ -290,15 +311,15 @@ class TestCkTileGemmPipeline : public ::testing::Test { if(tail_num == ck_tile::TailNumber::Three) { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } else { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); } } } @@ -307,7 +328,8 @@ class TestCkTileGemmPipeline : public ::testing::Test // Tail number 
always Full - #PrefetchStages if(tail_num == ck_tile::TailNumber::Full) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } else diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp index b125d19762..3dec229643 100644 --- a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp +++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp @@ -102,10 +102,13 @@ class TestCkTileGroupedGemm : public ::testing::Test float ave_time{0}; - const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) { - constexpr bool has_hot_loop_v = has_hot_loop_.value; - constexpr auto tail_number_v = tail_number_.value; - constexpr auto scheduler = ck_tile::GemmPipelineScheduler::Intrawave; + const auto Run = [&](const auto has_hot_loop_, + const auto tail_number_, + const auto memory_operation_) { + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + constexpr auto scheduler = ck_tile::GemmPipelineScheduler::Intrawave; + constexpr auto memory_operation = memory_operation_.value; using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem>; + UniversalGemmProblem::TransposeC, + memory_operation>>; using Kernel = ck_tile::GroupedGemmKernel; auto kargs = Kernel::MakeKargs(gemm_descs); @@ -164,11 +168,29 @@ class TestCkTileGroupedGemm : public ::testing::Test return ave_time; }; + const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { + if(gemm_descs[0].k_batch == 1) + { + Run(has_hot_loop_, + tail_number_, + ck_tile::integral_constant{}); + } + else + { + Run(has_hot_loop_, + tail_number_, + ck_tile::integral_constant{}); + } + }; + if(has_hot_loop) { if(tail_num == ck_tile::TailNumber::Full) { - Run(ck_tile::bool_constant{}, + RunSplitk( + ck_tile::bool_constant{}, ck_tile::integral_constant{}); } else From b8fa27bfef7b1d2df3984e1fd01e9c5df72f8b33 Mon Sep 17 00:00:00 2001 
From: Muhammed Emin Ozturk Date: Mon, 5 May 2025 13:12:22 -0700 Subject: [PATCH 095/443] Fix failure in test_batched_gemm_softmax_gemm_permute for lower resource devices (#2117) * Problematic test case are analyzed and turned off for lower resource GPUs * update device info * Update test_batched_gemm_softmax_gemm_permute_bf16_xdl.cpp * Update test_batched_gemm_softmax_gemm_permute_fp16_xdl.cpp * Update test/batched_gemm_softmax_gemm_permute/test_batched_gemm_device_utils.hpp Co-authored-by: John Afaganis --- ...ed_gemm_bias_softmax_gemm_permute_util.hpp | 2 + .../test_batched_gemm_device_utils.hpp | 67 ++++++++++++++ ...hed_gemm_softmax_gemm_permute_bf16_xdl.cpp | 87 +++++++++++++++---- ...hed_gemm_softmax_gemm_permute_fp16_xdl.cpp | 11 +++ 4 files changed, 150 insertions(+), 17 deletions(-) create mode 100644 test/batched_gemm_softmax_gemm_permute/test_batched_gemm_device_utils.hpp diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp index d7c39367c8..1464eacfa5 100644 --- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp +++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_bias_softmax_gemm_permute_util.hpp @@ -9,6 +9,8 @@ #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" #include "profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp" +#include + using ck::tensor_operation::device::GemmSpecialization; using ck::tensor_operation::device::MaskingSpecialization; using ck::tensor_operation::device::TensorSpecialization; diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_device_utils.hpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_device_utils.hpp new file mode 100644 index 0000000000..7d20ee4827 --- /dev/null +++ 
b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_device_utils.hpp @@ -0,0 +1,67 @@ +#pragma once + +#include +#include + +namespace ck { +namespace test { + +struct DeviceResources +{ + int computeUnits; + size_t totalMemory; + std::string deviceName; + // Add other relevant properties as needed +}; + +inline DeviceResources GetDeviceResources() +{ + DeviceResources res; + hipDeviceProp_t props; + + hipError_t status = hipGetDeviceProperties(&props, 0); + if(status != hipSuccess) + { + props.multiProcessorCount = 0; + res.computeUnits = 0; + res.totalMemory = 0; + res.deviceName = "Unknown"; + return res; + } + + res.computeUnits = props.multiProcessorCount; + res.totalMemory = props.totalGlobalMem; + res.deviceName = props.name; + + return res; +} + +// Device capability tiers +enum class DeviceCapabilityTier +{ + LOW, // Low resources devices (CU less than 80) + MEDIUM, // Mid-range devices + HIGH // High resources devices (CU hiher than 100) +}; + +inline DeviceCapabilityTier DetermineDeviceTier() +{ + DeviceResources res = GetDeviceResources(); + + // Adjust these thresholds based on your device specifics + if(res.computeUnits < 80) + { + return DeviceCapabilityTier::LOW; + } + else if(res.computeUnits < 100) + { + return DeviceCapabilityTier::MEDIUM; + } + else + { + return DeviceCapabilityTier::HIGH; + } +} + +} // namespace test +} // namespace ck diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16_xdl.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16_xdl.cpp index 8136257a24..8d894576c4 100644 --- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16_xdl.cpp +++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16_xdl.cpp @@ -3,6 +3,7 @@ #include "gtest/gtest.h" #include "test_batched_gemm_softmax_gemm_permute_util.hpp" +#include "test_batched_gemm_device_utils.hpp" template class 
TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16 @@ -110,14 +111,45 @@ TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Bench_BF16_Irregul TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Bench_BF16) { - this->lengths_ = std::vector>{ - {256, 256, 64, 64, 48, 16}, - {256, 256, 128, 128, 48, 16}, - {512, 512, 64, 64, 48, 16}, - {512, 512, 128, 128, 48, 16}, - {1024, 1024, 64, 64, 48, 16}, - {1024, 1024, 128, 128, 48, 16}, - }; + + // Get device capability tier + auto deviceTier = ck::test::DetermineDeviceTier(); + + // Configure test sizes based on device tier + if(deviceTier == ck::test::DeviceCapabilityTier::LOW) + { + // Minimal test sizes for low resource devices + this->lengths_ = std::vector>{ + {256, 256, 64, 64, 16, 8}, {256, 256, 128, 128, 16, 8}, {512, 512, 64, 64, 8, 4}}; + std::cout << "Running reduced benchmarks for low-resource device" << std::endl; + } + else if(deviceTier == ck::test::DeviceCapabilityTier::MEDIUM) + { + // Medium test sizes + this->lengths_ = std::vector>{{256, 256, 64, 64, 24, 12}, + {256, 256, 128, 128, 24, 12}, + {512, 512, 64, 64, 16, 8}, + {512, 512, 128, 128, 16, 8}, + {1024, 1024, 64, 64, 8, 4}, + {1024, 1024, 128, 128, 8, 4}}; + std::cout << "Running medium benchmarks for mid-tier device" << std::endl; + } + else + { + // Full test sizes for high resource devices + this->lengths_ = std::vector>{{256, 256, 64, 64, 48, 16}, + {256, 256, 128, 128, 48, 16}, + {512, 512, 64, 64, 48, 16}, + {512, 512, 128, 128, 48, 16}, + {1024, 1024, 64, 64, 48, 16}, + {1024, 1024, 128, 128, 48, 16}, + {2048, 2048, 64, 64, 48, 16}, + {2048, 2048, 128, 128, 48, 16}, + {4096, 4096, 64, 64, 48, 16}, + {4096, 4096, 128, 128, 48, 16}}; + std::cout << "Running full benchmarks for high-performance device" << std::endl; + } + this->bench_ = true; this->verify_ = false; this->Run(); @@ -127,9 +159,20 @@ using ck::tensor_operation::device::GemmSpecialization; TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteInterface, 
GemmSpecializationSizeMatch) { + + // Get device capability tier + auto deviceTier = ck::test::DetermineDeviceTier(); + int P = 120; // requires padding int Q = 128; // do not require padding + // For lower-end devices, we might need to skip some tests + if(deviceTier == ck::test::DeviceCapabilityTier::LOW) + { + std::cout << "Skipping GemmSpecialization tests for low-resource device" << std::endl; + return; + } + // IsSupported(M, N, K, O) // clang-format off EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(Q, Q, Q, Q)); @@ -153,15 +196,25 @@ TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteInterface, GemmSpecializationS TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteInterface, GemmSpecializationSizeMismatch) { - // IsSupported(M, N, K, O) - // clang-format off - EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(128, 128, 120, 128)); - EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(128, 128, 128, 120)); - // Kernel can't support odd K size because SrcVectorDim == KDim and must satisfy SizeKRaw % ABSrcScalarPerVector == 0 - EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(128, 128, 129, 128)); - EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(128, 128, 130, 128)); - // Kernel can't support odd O size because SrcVectorDim == ODim and must satisfy SizeORaw % B1SrcScalarPerVector == 0 - EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(128, 128, 128, 129)); + EXPECT_FALSE( + DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{} + .IsSupported(128, 128, 120, 128)); + EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128< + GemmSpecialization::MNKPadding>{} + .IsSupported(128, 128, 128, 120)); + // Kernel can't support odd K size because SrcVectorDim == KDim and must satisfy SizeKRaw % + // 
ABSrcScalarPerVector == 0 + EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128< + GemmSpecialization::MNKOPadding>{} + .IsSupported(128, 128, 129, 128)); + EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128< + GemmSpecialization::MNKOPadding>{} + .IsSupported(128, 128, 130, 128)); + // Kernel can't support odd O size because SrcVectorDim == ODim and must satisfy SizeORaw % + // B1SrcScalarPerVector == 0 + EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128< + GemmSpecialization::MNKOPadding>{} + .IsSupported(128, 128, 128, 129)); // clang-format on } diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16_xdl.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16_xdl.cpp index 81d404109f..3a86736f44 100644 --- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16_xdl.cpp +++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16_xdl.cpp @@ -3,6 +3,7 @@ #include "gtest/gtest.h" #include "test_batched_gemm_softmax_gemm_permute_util.hpp" +#include "test_batched_gemm_device_utils.hpp" template class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16 @@ -132,9 +133,19 @@ using ck::tensor_operation::device::GemmSpecialization; TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteInterface, GemmSpecializationSizeMatch) { + // Get device capability tier + auto deviceTier = ck::test::DetermineDeviceTier(); + int P = 120; // requires padding int Q = 128; // do not require padding + // For lower-end devices, we might need to skip some tests + if(deviceTier == ck::test::DeviceCapabilityTier::LOW) + { + std::cout << "Skipping GemmSpecialization tests for low-resource device" << std::endl; + return; + } + // IsSupported(M, N, K, O) // clang-format off EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, Q, Q, Q)); From 
4e9b76f88c572a6c54f34cc6467b96279c0e86e4 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Tue, 6 May 2025 17:32:07 +0800 Subject: [PATCH 096/443] [CK_TILE] optimize moe sorting kernel, boost large context case up to 20x (#2153) * combine 2-3 as single stage * support zeroing * improve long tokens * update specialization * b16 ws * 8bit topk optimize * update 15 example --- .../ck_tile/13_moe_sorting/moe_sorting.cpp | 3 +- .../13_moe_sorting/moe_sorting_api.cpp | 225 +++-- .../13_moe_sorting/moe_sorting_api.hpp | 2 +- .../13_moe_sorting/script/smoke_test.sh | 6 + example/ck_tile/15_fused_moe/fused_moe.hpp | 2 + .../ck_tile/15_fused_moe/fused_moesorting.hpp | 1 + .../15_fused_moe/instances/fused_moe_api.cpp | 6 + .../instances/fused_moesorting_api.cpp | 208 ++++- example/ck_tile/15_fused_moe/main.cpp | 3 +- include/ck_tile/core.hpp | 1 + include/ck_tile/core/arch/arch.hpp | 9 + .../ck_tile/core/arch/workgroup_barrier.hpp | 65 ++ include/ck_tile/core/config.hpp | 2 +- .../fused_moe/kernel/moe_sorting_kernel.hpp | 789 +++++++++++++++++- .../fused_moe/kernel/moe_sorting_problem.hpp | 9 +- 15 files changed, 1216 insertions(+), 115 deletions(-) create mode 100644 include/ck_tile/core/arch/workgroup_barrier.hpp diff --git a/example/ck_tile/13_moe_sorting/moe_sorting.cpp b/example/ck_tile/13_moe_sorting/moe_sorting.cpp index e59fcaedad..ce689a370c 100644 --- a/example/ck_tile/13_moe_sorting/moe_sorting.cpp +++ b/example/ck_tile/13_moe_sorting/moe_sorting.cpp @@ -153,9 +153,8 @@ bool test_moe_sorting(ck_tile::ArgParser args) local_expert_masking_dev.ToDevice(local_expert_masking_host.data()); // if return zero, means no need workspace, can set moe_sorting_args.p_ws to nullptr - ck_tile::index_t workspace_size = moe_sorting_get_workspace_size(tokens, num_experts); + ck_tile::index_t workspace_size = moe_sorting_get_workspace_size(tokens, num_experts, topk); ck_tile::DeviceMem moe_sorting_ws(workspace_size != 0 ? 
workspace_size : 0); - if(workspace_size != 0) moe_sorting_ws.SetZero(); // note, clear here!!!! diff --git a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp index 109ec1b157..305cf118d2 100644 --- a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp +++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp @@ -7,6 +7,14 @@ #define MOE_SORTING_USE_EX_KERNEL 1 #endif +#ifndef MOE_SORTING_SUPPORT_LARGE_EXPERT +#define MOE_SORTING_SUPPORT_LARGE_EXPERT 0 +#endif + +#ifndef MOE_SORTING_SUPPORT_LARGE_TOPK +#define MOE_SORTING_SUPPORT_LARGE_TOPK 0 +#endif + #if !MOE_SORTING_USE_EX_KERNEL #define MOE_SORTING_DISPATCH_ETILE(unroll_num_, expert_tile_) \ @@ -153,7 +161,7 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi } } #else - if(moe_sorting_get_workspace_size(a.tokens, a.num_experts) != 0) + if(moe_sorting_get_workspace_size(a.tokens, a.num_experts, a.topk) != 0) { return moe_sorting_mp(t, a, s); } @@ -171,57 +179,107 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi return -1; } -#define MOE_SORTING_MP_0(unroll_num_, expert_masking_) \ - [&]() { \ - constexpr ck_tile::index_t unroll_num = unroll_num_; \ - constexpr bool expert_masking = expert_masking_; \ - using ms_problem = \ - ck_tile::MoeSortingProblemMp; \ - using kernel = ck_tile::MoeSortingMultiPhaseKernel_P0; \ - auto kargs = kernel::MakeKargs(a); \ - const dim3 grids = kernel::GridSize(a); \ - const dim3 blocks = kernel::BlockSize(a); \ - return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ +#define MOE_SORTING_MP_0(mesh_type_, unroll_num_, expert_masking_) \ + [&]() { \ + constexpr ck_tile::index_t unroll_num = unroll_num_; \ + constexpr bool expert_masking = expert_masking_; \ + using ms_problem = ck_tile::MoeSortingProblemMp; \ + using kernel = ck_tile::MoeSortingMultiPhaseKernel_P0; \ + auto kargs = kernel::MakeKargs(a); \ + const dim3 grids = kernel::GridSize(a); \ + 
const dim3 blocks = kernel::BlockSize(a); \ + return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ }() -#define MOE_SORTING_MP_1(unroll_num_, expert_masking_) \ - [&]() { \ - constexpr ck_tile::index_t unroll_num = unroll_num_; \ - constexpr bool expert_masking = expert_masking_; \ - using ms_problem = \ - ck_tile::MoeSortingProblemMp; \ - using kernel = ck_tile::MoeSortingMultiPhaseKernel_P1; \ - auto kargs = kernel::MakeKargs(a); \ - const dim3 grids = kernel::GridSize(a); \ - const dim3 blocks = kernel::BlockSize(a); \ - return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ +#define MOE_SORTING_MP_1(mesh_type_, unroll_num_, expert_masking_) \ + [&]() { \ + constexpr ck_tile::index_t unroll_num = unroll_num_; \ + constexpr bool expert_masking = expert_masking_; \ + using ms_problem = ck_tile::MoeSortingProblemMp; \ + using kernel = ck_tile::MoeSortingMultiPhaseKernel_P1; \ + auto kargs = kernel::MakeKargs(a); \ + const dim3 grids = kernel::GridSize(a); \ + const dim3 blocks = kernel::BlockSize(a); \ + return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ + }() +#if MOE_SORTING_SUPPORT_LARGE_EXPERT +#define MOE_SORTING_MP_2(mesh_type_, unroll_num_, expert_masking_) \ + [&]() { \ + constexpr ck_tile::index_t unroll_num = unroll_num_; \ + constexpr bool expert_masking = expert_masking_; \ + using ms_problem = ck_tile::MoeSortingProblemMp; \ + using kernel = ck_tile::MoeSortingMultiPhaseKernel_P2; \ + auto kargs = kernel::MakeKargs(a); \ + const dim3 grids = kernel::GridSize(a); \ + const dim3 blocks = kernel::BlockSize(a); \ + return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ }() -#define MOE_SORTING_MP_2(unroll_num_, expert_masking_) \ - [&]() { \ - constexpr ck_tile::index_t unroll_num = unroll_num_; \ - constexpr bool expert_masking = expert_masking_; \ - using ms_problem = \ - ck_tile::MoeSortingProblemMp; \ - using kernel = ck_tile::MoeSortingMultiPhaseKernel_P2; \ - auto kargs = kernel::MakeKargs(a); \ - 
const dim3 grids = kernel::GridSize(a); \ - const dim3 blocks = kernel::BlockSize(a); \ - return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ +#define MOE_SORTING_MP_3(mesh_type_, unroll_num_, expert_masking_) \ + [&]() { \ + constexpr ck_tile::index_t unroll_num = unroll_num_; \ + constexpr bool expert_masking = expert_masking_; \ + using ms_problem = ck_tile::MoeSortingProblemMp; \ + using kernel = ck_tile::MoeSortingMultiPhaseKernel_P3; \ + auto kargs = kernel::MakeKargs(a); \ + const dim3 grids = kernel::GridSize(a); \ + const dim3 blocks = kernel::BlockSize(a); \ + return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ + }() +#endif + +#define MOE_SORTING_MP_23(mesh_type_, unroll_num_, expert_masking_) \ + [&]() { \ + constexpr ck_tile::index_t unroll_num = unroll_num_; \ + constexpr bool expert_masking = expert_masking_; \ + using ms_problem = ck_tile::MoeSortingProblemMp; \ + using kernel = ck_tile::MoeSortingMultiPhaseKernel_P23; \ + auto kargs = kernel::MakeKargs(a); \ + const dim3 grids = kernel::GridSize(a); \ + const dim3 blocks = kernel::BlockSize(a); \ + const auto lds_size = kernel::GetSmemSize(a); \ + return ck_tile::make_kernel(kernel{}, grids, blocks, lds_size, kargs); \ }() -#define MOE_SORTING_MP_3(unroll_num_, expert_masking_) \ - [&]() { \ - constexpr ck_tile::index_t unroll_num = unroll_num_; \ - constexpr bool expert_masking = expert_masking_; \ - using ms_problem = \ - ck_tile::MoeSortingProblemMp; \ - using kernel = ck_tile::MoeSortingMultiPhaseKernel_P3; \ - auto kargs = kernel::MakeKargs(a); \ - const dim3 grids = kernel::GridSize(a); \ - const dim3 blocks = kernel::BlockSize(a); \ - return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ - }() +#define MOR_SORTING_MP_DISPATCH_(mesh_type_, token_vec_0_, token_vec_1_, token_vec_23_) \ + if(t.local_expert_masking) \ + { \ + float ave_time = \ + ck_tile::launch_kernel(s, \ + MOE_SORTING_MP_0(mesh_type_, token_vec_0_, true), \ + MOE_SORTING_MP_1(mesh_type_, 
token_vec_1_, true), \ + MOE_SORTING_MP_23(mesh_type_, token_vec_23_, true)); \ + return ave_time; \ + } \ + else \ + { \ + float ave_time = \ + ck_tile::launch_kernel(s, \ + MOE_SORTING_MP_0(mesh_type_, token_vec_0_, false), \ + MOE_SORTING_MP_1(mesh_type_, token_vec_1_, false), \ + MOE_SORTING_MP_23(mesh_type_, token_vec_23_, false)); \ + return ave_time; \ + } float moe_sorting_mp(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s) { @@ -230,29 +288,74 @@ float moe_sorting_mp(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_co using ms_index_t = ck_tile::index_t; using ms_weight_type = float; - if(t.local_expert_masking) + if(ck_tile::impl::moe_sorting_get_smem_size_p23(a.num_experts) > + ck_tile::get_smem_capacity()) { - float ave_time = ck_tile::launch_kernel(s, - MOE_SORTING_MP_0(1, true), - MOE_SORTING_MP_1(1, true), - MOE_SORTING_MP_2(1, true), - MOE_SORTING_MP_3(1, true)); - return ave_time; +#if MOE_SORTING_SUPPORT_LARGE_EXPERT + if(t.local_expert_masking) + { + float ave_time = ck_tile::launch_kernel(s, + MOE_SORTING_MP_0(ms_index_t, 1, true), + MOE_SORTING_MP_1(ms_index_t, 1, true), + MOE_SORTING_MP_2(ms_index_t, 1, true), + MOE_SORTING_MP_3(ms_index_t, 1, true)); + return ave_time; + } + else + { + float ave_time = ck_tile::launch_kernel(s, + MOE_SORTING_MP_0(ms_index_t, 1, false), + MOE_SORTING_MP_1(ms_index_t, 1, false), + MOE_SORTING_MP_2(ms_index_t, 1, false), + MOE_SORTING_MP_3(ms_index_t, 1, false)); + return ave_time; + } +#else + printf("do not support large expert %d\n", a.num_experts); + return -1; +#endif } else { - float ave_time = ck_tile::launch_kernel(s, - MOE_SORTING_MP_0(1, false), - MOE_SORTING_MP_1(1, false), - MOE_SORTING_MP_2(1, false), - MOE_SORTING_MP_3(1, false)); - return ave_time; + ck_tile::index_t mesh_byte_size = + ck_tile::impl::moe_sorting_mesh_byte_size(a.tokens, a.num_experts, a.topk); + if(mesh_byte_size == 1) + { + if(a.tokens * a.topk % 4 == 0) + { + MOR_SORTING_MP_DISPATCH_(uint8_t, 4, 16, 16) 
+ } + else + { + MOR_SORTING_MP_DISPATCH_(uint8_t, 1, 16, 16) + } + } + else if(mesh_byte_size == 2) + { +#if MOE_SORTING_SUPPORT_LARGE_TOPK + if(a.tokens * a.topk % 4 == 0) + { + MOR_SORTING_MP_DISPATCH_(uint16_t, 4, 8, 8) + } + else + { + MOR_SORTING_MP_DISPATCH_(uint16_t, 1, 8, 8) + } +#else + printf("do not support large topk %d\n", a.topk); + return -1; +#endif + } + else + { + MOR_SORTING_MP_DISPATCH_(ck_tile::index_t, 1, 1, 1) + } } } return -1; } -int moe_sorting_get_workspace_size(int tokens, int num_experts) +int moe_sorting_get_workspace_size(int tokens, int num_experts, int topk) { - return ck_tile::moe_sorting_get_workspace_size(tokens, num_experts); + return ck_tile::moe_sorting_get_workspace_size(tokens, num_experts, topk); } diff --git a/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp index b47ae9013b..0fe8d81e70 100644 --- a/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp +++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp @@ -22,6 +22,6 @@ struct moe_sorting_args : public ck_tile::MoeSortingHostArgs // if return non zero, means need workspace, you need to allocate a GPU buffer // and set to moe_sorting_args.p_ws // NOTE: workspace size are required to clear zero before use the API -int moe_sorting_get_workspace_size(int tokens, int num_experts); +int moe_sorting_get_workspace_size(int tokens, int num_experts, int topk); float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s); float moe_sorting_mp(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s); diff --git a/example/ck_tile/13_moe_sorting/script/smoke_test.sh b/example/ck_tile/13_moe_sorting/script/smoke_test.sh index cf2c2e164b..fbfb10822c 100644 --- a/example/ck_tile/13_moe_sorting/script/smoke_test.sh +++ b/example/ck_tile/13_moe_sorting/script/smoke_test.sh @@ -26,3 +26,9 @@ $EXE -t=13 -e=64 -k=3 -local_eid=4,5,6,7,8,9,10,11 $EXE -t=99 -e=33 -k=9 -local_eid=6,10,11,15,19 $EXE -t=80 
-e=99 -k=10 -local_eid=0,8,12,33 $EXE -t=11 -e=256 -k=5 -local_eid=99,110,129 +$EXE -t=128 -e=128 -k=6 -moe_buf_size=163840 +$EXE -t=8192 -e=32 -k=5 -moe_buf_size=163840 +$EXE -t=8192 -e=32 -k=8 -moe_buf_size=163840 +$EXE -t=8192 -e=256 -k=5 -moe_buf_size=163840 +$EXE -t=8192 -e=256 -k=8 -moe_buf_size=163840 +$EXE -t=163840 -e=256 -k=8 -moe_buf_size=163840 \ No newline at end of file diff --git a/example/ck_tile/15_fused_moe/fused_moe.hpp b/example/ck_tile/15_fused_moe/fused_moe.hpp index b354d1d347..46425384cc 100644 --- a/example/ck_tile/15_fused_moe/fused_moe.hpp +++ b/example/ck_tile/15_fused_moe/fused_moe.hpp @@ -56,4 +56,6 @@ struct fused_moe_traits bool local_expert_masking; // if mask experts as local expert }; +// if return zero, no ws needed +int fused_moe_get_workspace_size(int tokens, int num_experts, int topk); float fused_moe(fused_moe_traits, fused_moe_args, const ck_tile::stream_config&); diff --git a/example/ck_tile/15_fused_moe/fused_moesorting.hpp b/example/ck_tile/15_fused_moe/fused_moesorting.hpp index a3ff8c5bf7..11e1c6e531 100644 --- a/example/ck_tile/15_fused_moe/fused_moesorting.hpp +++ b/example/ck_tile/15_fused_moe/fused_moesorting.hpp @@ -18,4 +18,5 @@ struct fused_moesorting_args : public ck_tile::MoeSortingHostArgs { }; +int fused_moe_get_workspace_size(int tokens, int num_experts, int topk); float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_tile::stream_config s); diff --git a/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp index f887d57aa9..b3515b1bec 100644 --- a/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp +++ b/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp @@ -2,6 +2,12 @@ // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "fused_moe.hpp" +#include "ck_tile/ops/fused_moe.hpp" + +int fused_moe_get_workspace_size(int tokens, int num_experts, int topk) +{ + return ck_tile::moe_sorting_get_workspace_size(tokens, num_experts, topk); +} float fused_moe(fused_moe_traits t, fused_moe_args a, const ck_tile::stream_config& s) { diff --git a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp index 7aedaa9317..0d83c48d02 100644 --- a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp +++ b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp @@ -7,6 +7,14 @@ #define MOE_SORTING_USE_EX_KERNEL 1 #endif +#ifndef MOE_SORTING_SUPPORT_LARGE_EXPERT +#define MOE_SORTING_SUPPORT_LARGE_EXPERT 0 +#endif + +#ifndef MOE_SORTING_SUPPORT_LARGE_TOPK +#define MOE_SORTING_SUPPORT_LARGE_TOPK 0 +#endif + #if !MOE_SORTING_USE_EX_KERNEL #define MOE_SORTING_DISPATCH_ETILE(unroll_num_, expert_tile_) \ @@ -107,6 +115,10 @@ } #endif +float fused_moesorting_mp(fused_moesorting_trait t, + fused_moesorting_args a, + ck_tile::stream_config s); + float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_tile::stream_config s) { if(t.weight_type == "fp32" && t.index_type == "int32") @@ -153,18 +165,198 @@ float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_til } } #else - using index_t = ck_tile::index_t; - using ms_weight_type = float; - auto [r_, c_] = ck_tile::moe_sorting_get_smem_row_col(a.tokens, a.num_experts); - auto sub_token_ = r_ - 2; - r_ = (r_ - 2) / 8; - bool is_sub_token_onshot = a.tokens <= sub_token_; + if(fused_moe_get_workspace_size(a.tokens, a.num_experts, a.topk) != 0) + { + return fused_moesorting_mp(t, a, s); + } + using index_t = ck_tile::index_t; + using ms_weight_type = float; + auto sub_token_ = ck_tile::moe_sorting_get_sub_token(a.tokens, a.num_experts); + auto row_ = sub_token_ / 8; + bool is_sub_token_onshot = a.tokens <= sub_token_; bool 
is_local_expert_masking = t.local_expert_masking; - (void)c_; - MOE_SORTING_DISPATCH_EMASK_(r_); + MOE_SORTING_DISPATCH_EMASK_(row_); // MOE_SORTING_DISPATCH_ETILE(0, 0); #endif } return -1; } + +#define MOE_SORTING_MP_0(mesh_type_, unroll_num_, expert_masking_) \ + [&]() { \ + constexpr ck_tile::index_t unroll_num = unroll_num_; \ + constexpr bool expert_masking = expert_masking_; \ + using ms_problem = ck_tile::MoeSortingProblemMp; \ + using kernel = ck_tile::MoeSortingMultiPhaseKernel_P0; \ + auto kargs = kernel::MakeKargs(a); \ + const dim3 grids = kernel::GridSize(a); \ + const dim3 blocks = kernel::BlockSize(a); \ + return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ + }() + +#define MOE_SORTING_MP_1(mesh_type_, unroll_num_, expert_masking_) \ + [&]() { \ + constexpr ck_tile::index_t unroll_num = unroll_num_; \ + constexpr bool expert_masking = expert_masking_; \ + using ms_problem = ck_tile::MoeSortingProblemMp; \ + using kernel = ck_tile::MoeSortingMultiPhaseKernel_P1; \ + auto kargs = kernel::MakeKargs(a); \ + const dim3 grids = kernel::GridSize(a); \ + const dim3 blocks = kernel::BlockSize(a); \ + return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ + }() +#if MOE_SORTING_SUPPORT_LARGE_EXPERT +#define MOE_SORTING_MP_2(mesh_type_, unroll_num_, expert_masking_) \ + [&]() { \ + constexpr ck_tile::index_t unroll_num = unroll_num_; \ + constexpr bool expert_masking = expert_masking_; \ + using ms_problem = ck_tile::MoeSortingProblemMp; \ + using kernel = ck_tile::MoeSortingMultiPhaseKernel_P2; \ + auto kargs = kernel::MakeKargs(a); \ + const dim3 grids = kernel::GridSize(a); \ + const dim3 blocks = kernel::BlockSize(a); \ + return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ + }() + +#define MOE_SORTING_MP_3(mesh_type_, unroll_num_, expert_masking_) \ + [&]() { \ + constexpr ck_tile::index_t unroll_num = unroll_num_; \ + constexpr bool expert_masking = expert_masking_; \ + using ms_problem = 
ck_tile::MoeSortingProblemMp; \ + using kernel = ck_tile::MoeSortingMultiPhaseKernel_P3; \ + auto kargs = kernel::MakeKargs(a); \ + const dim3 grids = kernel::GridSize(a); \ + const dim3 blocks = kernel::BlockSize(a); \ + return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ + }() +#endif + +#define MOE_SORTING_MP_23(mesh_type_, unroll_num_, expert_masking_) \ + [&]() { \ + constexpr ck_tile::index_t unroll_num = unroll_num_; \ + constexpr bool expert_masking = expert_masking_; \ + using ms_problem = ck_tile::MoeSortingProblemMp; \ + using kernel = ck_tile::MoeSortingMultiPhaseKernel_P23; \ + auto kargs = kernel::MakeKargs(a); \ + const dim3 grids = kernel::GridSize(a); \ + const dim3 blocks = kernel::BlockSize(a); \ + const auto lds_size = kernel::GetSmemSize(a); \ + return ck_tile::make_kernel(kernel{}, grids, blocks, lds_size, kargs); \ + }() + +#define MOR_SORTING_MP_DISPATCH_(mesh_type_, token_vec_0_, token_vec_1_, token_vec_23_) \ + if(t.local_expert_masking) \ + { \ + float ave_time = \ + ck_tile::launch_kernel(s, \ + MOE_SORTING_MP_0(mesh_type_, token_vec_0_, true), \ + MOE_SORTING_MP_1(mesh_type_, token_vec_1_, true), \ + MOE_SORTING_MP_23(mesh_type_, token_vec_23_, true)); \ + return ave_time; \ + } \ + else \ + { \ + float ave_time = \ + ck_tile::launch_kernel(s, \ + MOE_SORTING_MP_0(mesh_type_, token_vec_0_, false), \ + MOE_SORTING_MP_1(mesh_type_, token_vec_1_, false), \ + MOE_SORTING_MP_23(mesh_type_, token_vec_23_, false)); \ + return ave_time; \ + } + +float fused_moesorting_mp(fused_moesorting_trait t, + fused_moesorting_args a, + ck_tile::stream_config s) +{ + if(t.weight_type == "fp32" && t.index_type == "int32") + { + using ms_index_t = ck_tile::index_t; + using ms_weight_type = float; + + if(ck_tile::impl::moe_sorting_get_smem_size_p23(a.num_experts) > + ck_tile::get_smem_capacity()) + { +#if MOE_SORTING_SUPPORT_LARGE_EXPERT + if(t.local_expert_masking) + { + float ave_time = ck_tile::launch_kernel(s, + MOE_SORTING_MP_0(ms_index_t, 
1, true), + MOE_SORTING_MP_1(ms_index_t, 1, true), + MOE_SORTING_MP_2(ms_index_t, 1, true), + MOE_SORTING_MP_3(ms_index_t, 1, true)); + return ave_time; + } + else + { + float ave_time = ck_tile::launch_kernel(s, + MOE_SORTING_MP_0(ms_index_t, 1, false), + MOE_SORTING_MP_1(ms_index_t, 1, false), + MOE_SORTING_MP_2(ms_index_t, 1, false), + MOE_SORTING_MP_3(ms_index_t, 1, false)); + return ave_time; + } +#else + printf("do not support large expert %d\n", a.num_experts); + return -1; +#endif + } + else + { + ck_tile::index_t mesh_byte_size = + ck_tile::impl::moe_sorting_mesh_byte_size(a.tokens, a.num_experts, a.topk); + if(mesh_byte_size == 1) + { + if(a.tokens * a.topk % 4 == 0) + { + MOR_SORTING_MP_DISPATCH_(uint8_t, 4, 16, 16) + } + else + { + MOR_SORTING_MP_DISPATCH_(uint8_t, 1, 16, 16) + } + } + else if(mesh_byte_size == 2) + { +#if MOE_SORTING_SUPPORT_LARGE_TOPK + if(a.tokens * a.topk % 4 == 0) + { + MOR_SORTING_MP_DISPATCH_(uint16_t, 4, 8, 8) + } + else + { + MOR_SORTING_MP_DISPATCH_(uint16_t, 1, 8, 8) + } +#else + printf("do not support large topk %d\n", a.topk); + return -1; +#endif + } + else + { + MOR_SORTING_MP_DISPATCH_(ck_tile::index_t, 1, 1, 1) + } + } + } + return -1; +} diff --git a/example/ck_tile/15_fused_moe/main.cpp b/example/ck_tile/15_fused_moe/main.cpp index cb93ce8907..da843891ce 100644 --- a/example/ck_tile/15_fused_moe/main.cpp +++ b/example/ck_tile/15_fused_moe/main.cpp @@ -372,7 +372,8 @@ bool run(const ck_tile::ArgParser& arg_parser) num_sorted_tiles_host.get_element_space_size_in_bytes()); // if return zero, means no need workspace, can set moe_sorting_args.p_ws to nullptr - ck_tile::index_t workspace_size = ck_tile::moe_sorting_get_workspace_size(tokens, experts); + ck_tile::index_t workspace_size = + ck_tile::moe_sorting_get_workspace_size(tokens, experts, topk); ck_tile::DeviceMem moe_sorting_ws(workspace_size != 0 ? workspace_size : 0); if(workspace_size != 0) moe_sorting_ws.SetZero(); // note, clear here!!!! 
diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index 821b3a8e84..b94157eaec 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -13,6 +13,7 @@ #include "ck_tile/core/arch/arch.hpp" #include "ck_tile/core/arch/generic_memory_space_atomic.hpp" #include "ck_tile/core/arch/utility.hpp" +#include "ck_tile/core/arch/workgroup_barrier.hpp" #include "ck_tile/core/config.hpp" #include "ck_tile/core/container/array.hpp" #include "ck_tile/core/container/container_helper.hpp" diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp index 09de5f325f..1d3cf5c010 100644 --- a/include/ck_tile/core/arch/arch.hpp +++ b/include/ck_tile/core/arch/arch.hpp @@ -154,4 +154,13 @@ __host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_addres #pragma clang diagnostic pop } +CK_TILE_HOST_DEVICE constexpr index_t get_smem_capacity() +{ +#if defined(__gfx950__) + return 163840; +#else + return 65536; +#endif +} + } // namespace ck_tile diff --git a/include/ck_tile/core/arch/workgroup_barrier.hpp b/include/ck_tile/core/arch/workgroup_barrier.hpp new file mode 100644 index 0000000000..827a490fcb --- /dev/null +++ b/include/ck_tile/core/arch/workgroup_barrier.hpp @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core/config.hpp" +#include "ck_tile/core/numeric/integer.hpp" + +namespace ck_tile { + +struct workgroup_barrier +{ + CK_TILE_DEVICE workgroup_barrier(uint32_t* ptr) : base_ptr(ptr) {} + + CK_TILE_DEVICE uint32_t ld(uint32_t offset = 0) + { + return __atomic_load_n(base_ptr + offset, __ATOMIC_RELAXED); + } + + CK_TILE_DEVICE void wait_eq(uint32_t value, uint32_t offset = 0) + { + if(threadIdx.x == 0) + { + while(ld(offset) != value) {} + } + __syncthreads(); + } + + CK_TILE_DEVICE void wait_lt(uint32_t value, uint32_t offset = 0) + { + if(threadIdx.x == 0) + { + while(ld(offset) < value) {} + } + __syncthreads(); + } + + CK_TILE_DEVICE void wait_set(uint32_t compare, uint32_t value, uint32_t offset = 0) + { + if(threadIdx.x == 0) + { + while(atomicCAS(base_ptr + offset, compare, value) != compare) {} + } + __syncthreads(); + } + + // enter critical zoon, assume buffer is zero when launch kernel + CK_TILE_DEVICE void aquire(uint32_t offset = 0) { wait_set(offset, 0, 1); } + + // exit critical zoon, assume buffer is zero when launch kernel + CK_TILE_DEVICE void release(uint32_t offset = 0) { wait_set(offset, 1, 0); } + + CK_TILE_DEVICE void inc(uint32_t offset = 0) + { + __syncthreads(); + if(threadIdx.x == 0) + { + atomicAdd(base_ptr + offset, 1); + } + } + + uint32_t* base_ptr; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp index 414509e479..27133fa847 100644 --- a/include/ck_tile/core/config.hpp +++ b/include/ck_tile/core/config.hpp @@ -257,5 +257,5 @@ #endif #ifndef CK_TILE_WA_ISSUE_2028 -#define CK_TILE_WA_ISSUE_2028 1 +#define CK_TILE_WA_ISSUE_2028 0 #endif diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp index 6a7ccd2472..664294fe18 100644 --- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp +++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp @@ -19,6 
+19,10 @@ namespace ck_tile { #define MOE_SORTING_USE_EX_KERNEL 1 #endif +#ifndef MOE_SORTING_FUSE_MP_01 +#define MOE_SORTING_FUSE_MP_01 0 +#endif + // clang-format off // [indexing implementation-1] // using M_a as constexpr block_size to partition all tokens into different slices @@ -118,7 +122,7 @@ CK_TILE_HOST constexpr auto moe_sorting_get_smem_row_col(int tokens_, int num_ex int smem_cols = num_experts_ + 1; // usually experts is power of 2. padding here int smem_rows = [&](){ index_t target_occupancy_ = 2; - constexpr index_t total_ = 65536 / sizeof(int); + constexpr index_t total_ = get_smem_capacity() / sizeof(index_t); constexpr index_t sub_unroll = 8; constexpr index_t cumsum_bufs = 2; // 1 for cumsum, 1 for cnt // at lease 2 lines, one for sub_token unroll, one for cumsum @@ -250,7 +254,7 @@ struct MoeSortingKernel { #if MOE_SORTING_USE_EX_KERNEL auto [smem_rows, smem_cols] = moe_sorting_get_smem_row_col(h.tokens, h.num_experts); - return smem_rows * smem_cols * sizeof(int); + return smem_rows * smem_cols * sizeof(index_t); #else const auto blocks = BlockSize(h); // usually num_experts is power of 2, we pad 1 dword here for the row-size @@ -1063,17 +1067,43 @@ CK_TILE_HOST_DEVICE index_t moe_sorting_mp_mesh_stride(index_t tokens) return (tokens + chunk - 1) / chunk * chunk; }; -CK_TILE_HOST_DEVICE index_t moe_sorting_mp_mesh_elem(index_t tokens, index_t num_experts) +// 4-i32 mesh, 2-i16 mseh, 1-i8 mesh +CK_TILE_HOST index_t moe_sorting_mesh_byte_size(index_t tokens_, + index_t /*num_experts_*/, + index_t topk_) +{ + // small token case, let's run mesh with dword score board + if(tokens_ < 512) + return 4; + else + { + if(topk_ >= 255) + return 2; // 16bit mesh + else + return 1; // 8bit mesh if small enough + } +} + +CK_TILE_HOST_DEVICE index_t moe_sorting_mp_mesh_smem_size(index_t tokens, + index_t num_experts, + index_t topk) { index_t row_size = moe_sorting_mp_mesh_stride(tokens); - return num_experts * row_size; + index_t elem = num_experts * 
row_size; + return elem * moe_sorting_mesh_byte_size(tokens, num_experts, topk); }; -CK_TILE_HOST_DEVICE index_t moe_sorting_mp_cumsum_elem(index_t num_experts) +CK_TILE_HOST_DEVICE index_t moe_sorting_mp_cumsum_smem_size(index_t num_experts) { constexpr index_t chunk = 32; index_t row_size = num_experts + 1; - return (row_size + chunk - 1) / chunk * chunk; + return (row_size + chunk - 1) / chunk * chunk * sizeof(index_t); +}; + +CK_TILE_HOST_DEVICE index_t moe_sorting_mp_sem_smem_size() +{ + constexpr index_t chunk = 32; + return chunk * sizeof(index_t); }; template @@ -1245,15 +1275,20 @@ CK_TILE_HOST bool moe_sorting_is_oneshot(int tokens_, int num_experts_) } // return size in byte -CK_TILE_HOST index_t moe_sorting_mp_get_workspace_size(int tokens_, int num_experts_) +CK_TILE_HOST index_t moe_sorting_mp_get_workspace_size(int tokens_, int num_experts_, int topk_) { - index_t elem = impl::moe_sorting_mp_mesh_elem(tokens_, num_experts_) + - impl::moe_sorting_mp_cumsum_elem(num_experts_); - return elem * sizeof(index_t); + index_t s_ = impl::moe_sorting_mp_mesh_smem_size(tokens_, num_experts_, topk_) + + impl::moe_sorting_mp_cumsum_smem_size(num_experts_) +#if MOE_SORTING_FUSE_MP_01 + + impl::moe_sorting_mp_sem_smem_size(); +#else + ; +#endif + return s_; } // return size in byte -CK_TILE_HOST index_t moe_sorting_get_workspace_size(int tokens_, int num_experts_) +CK_TILE_HOST index_t moe_sorting_get_workspace_size(int tokens_, int num_experts_, int topk_) { #if 1 if(moe_sorting_is_oneshot(tokens_, num_experts_)) @@ -1262,10 +1297,10 @@ CK_TILE_HOST index_t moe_sorting_get_workspace_size(int tokens_, int num_experts } else { - return moe_sorting_mp_get_workspace_size(tokens_, num_experts_); + return moe_sorting_mp_get_workspace_size(tokens_, num_experts_, topk_); } #else - return moe_sorting_mp_get_workspace_size(tokens_, num_experts_); + return moe_sorting_mp_get_workspace_size(tokens_, num_experts_, topk_); #endif } @@ -1320,6 +1355,7 @@ struct 
MoeSortingMultiPhaseKernel_P0 using IndexType = typename Problem::IndexType; using WeightType = typename Problem::WeightType; + using MeshType = typename Problem::MeshType; static constexpr index_t BLOCK_SIZE = 256; static constexpr index_t OCCUPANCY = 2; // hard coded @@ -1371,22 +1407,21 @@ struct MoeSortingMultiPhaseKernel_P0 { using topk_id_t = ext_vector_t; - static_assert(Problem::SubTokenTile == 1 || Problem::SubTokenTile == 2 || - Problem::SubTokenTile == 4); - const topk_id_t* p_topk_ids = reinterpret_cast(kargs.p_topk_ids); - IndexType* p_expert_mesh = reinterpret_cast(kargs.p_expert_mesh); + MeshType* p_expert_mesh = reinterpret_cast(kargs.p_expert_mesh); index_t total_elem = kargs.tokens * kargs.topk_mdiv.divisor / Problem::SubTokenTile; #pragma unroll Problem::SubTokenTile - for(index_t i = blockIdx.x * BLOCK_SIZE + threadIdx.x; i < total_elem; i += blockDim.x) + for(index_t i = blockIdx.x * BLOCK_SIZE + threadIdx.x; i < total_elem; + i += gridDim.x * BLOCK_SIZE) { auto x = p_topk_ids[i]; static_for<0, Problem::SubTokenTile, 1>{}([&](auto j) { IndexType eid = x[j.value]; // ext_vector_type must use int to [] uint32_t curr_token_id, curr_topk_id; kargs.topk_mdiv.divmod(i * Problem::SubTokenTile + j, curr_token_id, curr_topk_id); - p_expert_mesh[eid * kargs.mesh_stride + curr_token_id] = curr_topk_id + 1; + p_expert_mesh[eid * kargs.mesh_stride + curr_token_id] = + (curr_topk_id + 1) & 0xffff; }); } } @@ -1400,6 +1435,7 @@ struct MoeSortingMultiPhaseKernel_P1 using IndexType = typename Problem::IndexType; using WeightType = typename Problem::WeightType; + using MeshType = typename Problem::MeshType; static constexpr index_t BLOCK_SIZE = 256; static constexpr index_t OCCUPANCY = 2; // hard coded @@ -1420,9 +1456,9 @@ struct MoeSortingMultiPhaseKernel_P1 Kargs k; k.p_local_expert_mask = h.p_local_expert_mask; k.p_expert_mesh = h.p_ws; - k.p_expert_cumsum = - reinterpret_cast(reinterpret_cast(h.p_ws) + - impl::moe_sorting_mp_mesh_elem(h.tokens, 
h.num_experts)); + k.p_expert_cumsum = reinterpret_cast( + reinterpret_cast(h.p_ws) + + impl::moe_sorting_mp_mesh_smem_size(h.tokens, h.num_experts, h.topk)); k.mesh_stride = impl::moe_sorting_mp_mesh_stride(h.tokens); return k; @@ -1444,13 +1480,11 @@ struct MoeSortingMultiPhaseKernel_P1 int eid = blockIdx.x; - constexpr index_t index_pack = 4; // always packed - using r_t = ext_vector_t; // always use int32x4 + constexpr index_t index_pack = Problem::SubTokenTile; // always packed + using r_t = ext_vector_t; // always use int32x4 r_t* p_expert_mesh = reinterpret_cast( - reinterpret_cast(kargs.p_expert_mesh) + eid * kargs.mesh_stride); + reinterpret_cast(kargs.p_expert_mesh) + eid * kargs.mesh_stride); - static_assert(Problem::SubTokenTile == 1 || Problem::SubTokenTile == 2 || - Problem::SubTokenTile == 4); const IndexType* p_local_expert_mask = static_cast(kargs.p_local_expert_mask); IndexType* p_expert_cumsum = reinterpret_cast(kargs.p_expert_cumsum); @@ -1502,6 +1536,197 @@ struct MoeSortingMultiPhaseKernel_P1 } }; +#if MOE_SORTING_FUSE_MP_01 +template +struct MoeSortingMultiPhaseKernel_P01 +{ + using Problem = remove_cvref_t; + + using IndexType = typename Problem::IndexType; + using WeightType = typename Problem::WeightType; + using MeshType = typename Problem::MeshType; + + static constexpr index_t BLOCK_SIZE = 256; + static constexpr index_t OCCUPANCY = 2; // hard coded + + typedef MoeSortingHostArgs MoeSortingKargs; + + using Hargs = MoeSortingHostArgs; + + struct Kargs + { + const void* p_topk_ids; // [tokens, topk] + const void* p_local_expert_mask; // [expert] + void* p_expert_mesh; // [expert, tokens] + void* p_expert_cumsum; // [expert + 1] + void* p_expert_sem; // [1] + index_t tokens; + index_t num_experts; + index_t mesh_stride; // mesh_stride for p_expert_mesh + index_t wg_count; // used for semaphore + mdiv topk_mdiv; + }; + + CK_TILE_HOST static constexpr auto get_num_cu() + { + index_t num_cu = [&]() { + hipDeviceProp_t dev_prop; + hipDevice_t 
dev; + HIP_CHECK_ERROR(hipGetDevice(&dev)); + HIP_CHECK_ERROR(hipGetDeviceProperties(&dev_prop, dev)); + return dev_prop.multiProcessorCount; + }(); + return num_cu; + } + + CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h) + { + Kargs k; + k.p_topk_ids = h.p_topk_ids; + k.p_local_expert_mask = h.p_local_expert_mask; + k.p_expert_mesh = h.p_ws; + k.p_expert_cumsum = reinterpret_cast( + reinterpret_cast(h.p_ws) + + impl::moe_sorting_mp_mesh_smem_size(h.tokens, h.num_experts, h.topk)); + k.p_expert_sem = reinterpret_cast( + reinterpret_cast(h.p_ws) + + impl::moe_sorting_mp_mesh_smem_size(h.tokens, h.num_experts, h.topk) + + impl::moe_sorting_mp_cumsum_smem_size(h.num_experts)); + k.tokens = h.tokens; + k.num_experts = h.num_experts; + k.mesh_stride = impl::moe_sorting_mp_mesh_stride(h.tokens); + k.wg_count = WGCounts(h); + k.topk_mdiv = mdiv{static_cast(h.topk)}; + return k; + } + + CK_TILE_HOST static constexpr auto GridSize(const Hargs&) { return get_num_cu() * OCCUPANCY; } + + CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(BLOCK_SIZE); } + + CK_TILE_HOST static constexpr auto WGCounts(const Hargs& h) + { + index_t total_elem = h.tokens * h.topk / Problem::SubTokenTile; + index_t elem_cnt = (total_elem + BLOCK_SIZE - 1) / BLOCK_SIZE; + + // no more than grid_size + return min(elem_cnt, GridSize(h)); + } + + // in byte + CK_TILE_HOST static constexpr auto GetSmemSize() + { + return BLOCK_SIZE / warpSize * sizeof(IndexType); + } + + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + workgroup_barrier wb{reinterpret_cast(kargs.p_expert_sem)}; + + { + using topk_id_t = ext_vector_t; + + const topk_id_t* p_topk_ids = reinterpret_cast(kargs.p_topk_ids); + IndexType* p_expert_mesh = reinterpret_cast(kargs.p_expert_mesh); + index_t total_elem = kargs.tokens * kargs.topk_mdiv.divisor / Problem::SubTokenTile; + +#pragma unroll Problem::SubTokenTile + for(index_t i = blockIdx.x * BLOCK_SIZE + threadIdx.x; i < total_elem; + i += 
BLOCK_SIZE * gridDim.x) + { + auto x = p_topk_ids[i]; + static_for<0, Problem::SubTokenTile, 1>{}([&](auto j) { + IndexType eid = x[j.value]; // ext_vector_type must use int to [] + uint32_t curr_token_id, curr_topk_id; + kargs.topk_mdiv.divmod( + i * Problem::SubTokenTile + j, curr_token_id, curr_topk_id); + p_expert_mesh[eid * kargs.mesh_stride + curr_token_id] = curr_topk_id + 1; + }); + } + if(static_cast(blockIdx.x) < kargs.wg_count) + { + wb.inc(); + } + } + + { + __shared__ char smem[GetSmemSize()]; + int eid = blockIdx.x; + + // early exist in case of extra atomic wait + if(eid >= kargs.num_experts) + return; + + wb.wait_lt(kargs.wg_count); + + for(; eid < kargs.num_experts; eid += gridDim.x) + { + // if(threadIdx.x == 0) + // printf("!!! bid:%d, eid:%d (%d, %d)\n", + // static_cast(blockIdx.x), + // eid, + // kargs.num_experts, + // static_cast(blockDim.x)); + constexpr index_t index_pack = 4; // always packed + using r_t = ext_vector_t; // always use int32x4 + r_t* p_expert_mesh = reinterpret_cast( + reinterpret_cast(kargs.p_expert_mesh) + eid * kargs.mesh_stride); + + const IndexType* p_local_expert_mask = + static_cast(kargs.p_local_expert_mask); + IndexType* p_expert_cumsum = reinterpret_cast(kargs.p_expert_cumsum); + + auto f_sum = [](auto x_, auto y_) { return x_ + y_; }; + + int loops = (kargs.mesh_stride / index_pack + BLOCK_SIZE - 1) / BLOCK_SIZE; + + if constexpr(Problem::LocalExpertMasking) + { + IndexType mask = p_local_expert_mask[eid]; + if(mask == 0) + continue; // skip + } + + index_t cnt = 0; // per-wave cnt + for(int i = 0; i < loops; i++) + { + int position = i * BLOCK_SIZE + threadIdx.x; + r_t v{0}; + if(position < (kargs.mesh_stride / index_pack)) + v = p_expert_mesh[position]; + index_t local_sum = 0; + static_for<0, index_pack, 1>{}( + [&](auto i_vec) { local_sum += v[i_vec.value] != 0 ? 
1 : 0; }); + cnt += impl::moe_sorting_wave_reduce(local_sum, f_sum); + } + + index_t lane_id = threadIdx.x % warpSize; + index_t wave_id = threadIdx.x / warpSize; + + // reduce cross wave + IndexType* s = reinterpret_cast(smem); + __syncthreads(); + if(lane_id == 0) + { + s[wave_id] = cnt; + } + __syncthreads(); + + if(threadIdx.x == 0) + { + index_t c = 0; + for(auto i = 0; i < (BLOCK_SIZE / warpSize); i++) + { + c += s[i]; + } + p_expert_cumsum[eid] = c; + } + } + } + } +}; +#endif + // token count cumsum template struct MoeSortingMultiPhaseKernel_P2 @@ -1510,6 +1735,7 @@ struct MoeSortingMultiPhaseKernel_P2 using IndexType = typename Problem::IndexType; using WeightType = typename Problem::WeightType; + using MeshType = typename Problem::MeshType; static constexpr index_t BLOCK_SIZE = 256; static constexpr index_t OCCUPANCY = 2; // hard coded @@ -1536,10 +1762,9 @@ struct MoeSortingMultiPhaseKernel_P2 { Kargs k; k.p_local_expert_mask = h.p_local_expert_mask; - // k.p_expert_mesh = h.p_ws; - k.p_expert_cumsum = - reinterpret_cast(reinterpret_cast(h.p_ws) + - impl::moe_sorting_mp_mesh_elem(h.tokens, h.num_experts)); + k.p_expert_cumsum = reinterpret_cast( + reinterpret_cast(h.p_ws) + + impl::moe_sorting_mp_mesh_smem_size(h.tokens, h.num_experts, h.topk)); k.p_total_tokens_post_pad = h.p_total_tokens_post_pad; k.p_sorted_expert_ids = h.p_sorted_expert_ids; @@ -1566,7 +1791,8 @@ struct MoeSortingMultiPhaseKernel_P2 // in byte CK_TILE_HOST_DEVICE static constexpr auto GetSmemSize() { - return 2 * BLOCK_SIZE * sizeof(IndexType); + // return 2 * BLOCK_SIZE * sizeof(IndexType); + return (4 + 2 * BLOCK_SIZE / warpSize) * sizeof(IndexType); } // reduce single pixel within a wave @@ -1718,6 +1944,7 @@ struct MoeSortingMultiPhaseKernel_P3 using IndexType = typename Problem::IndexType; using WeightType = typename Problem::WeightType; + using MeshType = typename Problem::MeshType; static constexpr index_t BLOCK_SIZE = 256; static constexpr index_t OCCUPANCY = 2; // hard coded 
@@ -1749,9 +1976,9 @@ struct MoeSortingMultiPhaseKernel_P3 k.p_sorted_token_ids = h.p_sorted_token_ids; k.p_sorted_weights = h.p_sorted_weights; k.p_expert_mesh = h.p_ws; - k.p_expert_cumsum = - reinterpret_cast(reinterpret_cast(h.p_ws) + - impl::moe_sorting_mp_mesh_elem(h.tokens, h.num_experts)); + k.p_expert_cumsum = reinterpret_cast( + reinterpret_cast(h.p_ws) + + impl::moe_sorting_mp_mesh_smem_size(h.tokens, h.num_experts, h.topk)); k.tokens = h.tokens; k.num_experts = h.num_experts; k.topk_mdiv = mdiv{static_cast(h.topk)}; @@ -1782,9 +2009,6 @@ struct MoeSortingMultiPhaseKernel_P3 const WeightType* p_weights = static_cast(kargs.p_weights); WeightType* p_sorted_weights = reinterpret_cast(kargs.p_sorted_weights); - static_assert(Problem::SubTokenTile == 1 || Problem::SubTokenTile == 2 || - Problem::SubTokenTile == 4); - int eid = blockIdx.x; int wave_id = threadIdx.x / warpSize; int lane_id = threadIdx.x % warpSize; @@ -1866,6 +2090,495 @@ struct MoeSortingMultiPhaseKernel_P3 } }; +namespace impl { +// we use dynamic LDS size here +CK_TILE_HOST constexpr auto moe_sorting_get_smem_size_p23(int num_experts_) +{ + constexpr index_t BLOCK_SIZE = 256; // hardcoded 256 + const index_t expert_cumsum_elem = num_experts_ + 1; + return (4 + 2 * BLOCK_SIZE / warpSize + expert_cumsum_elem) * sizeof(int); +} +} // namespace impl + +// token count cumsum +template +struct MoeSortingMultiPhaseKernel_P23 +{ + using Problem = remove_cvref_t; + + using IndexType = typename Problem::IndexType; + using WeightType = typename Problem::WeightType; + using MeshType = typename Problem::MeshType; + + static constexpr index_t BLOCK_SIZE = 256; + static constexpr index_t OCCUPANCY = 2; // hard coded + + typedef MoeSortingHostArgs MoeSortingKargs; + + using Hargs = MoeSortingHostArgs; + struct Kargs + { + const void* p_weights; + const void* p_local_expert_mask; // [expert] + void* p_expert_mesh; // [expert, tokens] + void* p_expert_cumsum; // [expert + 1] + void* p_total_tokens_post_pad; 
// [1] + void* p_sorted_expert_ids; + + void* p_sorted_token_ids; + void* p_sorted_weights; + void* p_moe_buf; + + index_t tokens; + index_t num_experts; + index_t mesh_stride; // mesh_stride for p_expert_mesh + mdiv unit_size_mdiv; + mdiv topk_mdiv; + long_index_t moe_buf_bytes; + }; + + CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h) + { + Kargs k; + k.p_weights = h.p_weights; + k.p_local_expert_mask = h.p_local_expert_mask; + k.p_expert_mesh = h.p_ws; + k.p_expert_cumsum = reinterpret_cast( + reinterpret_cast(h.p_ws) + + impl::moe_sorting_mp_mesh_smem_size(h.tokens, h.num_experts, h.topk)); + k.p_total_tokens_post_pad = h.p_total_tokens_post_pad; + k.p_sorted_expert_ids = h.p_sorted_expert_ids; + + k.p_sorted_token_ids = h.p_sorted_token_ids; + k.p_sorted_weights = h.p_sorted_weights; + + k.p_moe_buf = h.p_moe_buf; + + k.tokens = h.tokens; + k.num_experts = h.num_experts; + k.mesh_stride = impl::moe_sorting_mp_mesh_stride(h.tokens); + k.unit_size_mdiv = mdiv{static_cast(h.unit_size)}; + k.topk_mdiv = mdiv{static_cast(h.topk)}; + + k.moe_buf_bytes = h.moe_buf_bytes; + + return k; + } + + CK_TILE_HOST static constexpr auto GridSize(const Hargs& h) + { + // use 1 block to cumsum + // return dim3(1 + ck_tile::integer_divide_ceil(h.moe_buf_bytes, BLOCK_SIZE * 16)); + return dim3(h.num_experts + ck_tile::integer_divide_ceil(h.moe_buf_bytes, BLOCK_SIZE * 16)); + } + + CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(BLOCK_SIZE); } + + // only use this at host ! 
+ CK_TILE_HOST static constexpr auto GetSmemSize(const Hargs& h) + { + const auto smem_23 = impl::moe_sorting_get_smem_size_p23(h.num_experts); + const auto smem_sf = BLOCK_SIZE * 4 * sizeof(IndexType); + return max(smem_23, smem_sf); + } + + // reduce single pixel within a wave + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + if(static_cast(blockIdx.x) >= kargs.num_experts) + { + impl::moe_buf_set_zero_kernel( + reinterpret_cast(kargs.p_moe_buf), + kargs.moe_buf_bytes, + blockIdx.x - kargs.num_experts); + return; + } + + extern __shared__ char smem[]; + { + IndexType* s = reinterpret_cast(smem); + + const IndexType* p_local_expert_mask = + static_cast(kargs.p_local_expert_mask); + IndexType* p_expert_cumsum = reinterpret_cast(kargs.p_expert_cumsum); + IndexType* p_expert_cumsum_smem = s + 4 + 2 * BLOCK_SIZE / warpSize; + IndexType* p_total_tokens_post_pad = + reinterpret_cast(kargs.p_total_tokens_post_pad); + IndexType* p_sorted_expert_ids = + reinterpret_cast(kargs.p_sorted_expert_ids); + + const index_t loops = (kargs.num_experts + BLOCK_SIZE - 1) / BLOCK_SIZE; + index_t wave_id = threadIdx.x / warpSize; + index_t lane_id = threadIdx.x % warpSize; + + IndexType prev_cumsum_a = 0; + IndexType prev_cumsum_b = 0; + + for(index_t i = 0; i < loops; i++) + { + index_t position = i * BLOCK_SIZE + threadIdx.x; + IndexType a_ = 0; // token count for a expert + IndexType b_ = 0; // mask for a expert + if(position < kargs.num_experts) + { + a_ = p_expert_cumsum[position]; + if constexpr(Problem::LocalExpertMasking) + b_ = p_local_expert_mask[position]; + } + + int blocks_pers_expert = + kargs.unit_size_mdiv.div(a_ + kargs.unit_size_mdiv.divisor - 1); + // pad token + int padded_blocks_per_expert = [&]() { + int x_ = [&]() { + if constexpr(Problem::SkipExpertsWithZeroTokens) + { + // if local_cnt is zero, blocks_pers_expert will be zero + // this is what we want to achieve + return blocks_pers_expert; // * kargs.unit_size_mdiv.divisor; + } + else + { + return 
max(blocks_pers_expert, 1); + } + }(); + if constexpr(Problem::LocalExpertMasking) + { + return b_ ? x_ : 0; + } + else + return x_; + }(); + + IndexType cumsum_a = padded_blocks_per_expert; + IndexType cumsum_b = b_; + + // Note: we first cumsum local round, then add previous cumsum + impl::moe_sorting_wave_cumsum(cumsum_a); + impl::moe_sorting_wave_cumsum(cumsum_b); + + __syncthreads(); + if(lane_id == warpSize - 1) + { + s[4 + wave_id] = cumsum_a; + s[4 + wave_id + BLOCK_SIZE / warpSize] = cumsum_b; + } + + __syncthreads(); + + // reduce cross wave + static_for<0, BLOCK_SIZE / warpSize - 1, 1>{}([&](auto i_w) { + IndexType prev_a = s[4 + i_w]; + IndexType prev_b = s[4 + i_w + BLOCK_SIZE / warpSize]; + prev_a = wave_id > i_w ? prev_a : 0; // mask out + prev_b = wave_id > i_w ? prev_b : 0; // mask out + cumsum_a += prev_a; + cumsum_b += prev_b; + }); + + // Now let's add previous cumsum + cumsum_a += prev_cumsum_a; + cumsum_b += prev_cumsum_b; + + if(threadIdx.x == BLOCK_SIZE - 1) + { + s[2] = cumsum_a; // store the last cumsum + s[3] = cumsum_b; + } + + IndexType out_0 = cumsum_a - padded_blocks_per_expert; // exclusive cumsum tok cnt + IndexType out_1 = cumsum_b - b_; // exclusive cumsum mask cnt + + __syncthreads(); + prev_cumsum_a = s[2]; + prev_cumsum_b = s[3]; + + if(position < kargs.num_experts) + { + p_expert_cumsum_smem[position] = out_0 * kargs.unit_size_mdiv.divisor; + } + + { + if(blockIdx.x == 0) + { + if constexpr(Problem::LocalExpertMasking) + { + if(b_) + { + for(int j = 0; j < blocks_pers_expert; j++) + { + p_sorted_expert_ids[out_0 + j] = out_1; + } + } + } + else + { + for(int j = 0; j < blocks_pers_expert; j++) + { + p_sorted_expert_ids[out_0 + j] = position; + } + } + } + } + } + + if(threadIdx.x == 0) + { + auto total_tokens_post_pad = prev_cumsum_a * kargs.unit_size_mdiv.divisor; + if(blockIdx.x == 0) + p_total_tokens_post_pad[0] = total_tokens_post_pad; + p_expert_cumsum_smem[kargs.num_experts] = total_tokens_post_pad; + } + } + + 
__syncthreads(); + + { + const IndexType* p_local_expert_mask = + static_cast(kargs.p_local_expert_mask); + IndexType* s = reinterpret_cast(smem); + MeshType* p_expert_mesh = reinterpret_cast(kargs.p_expert_mesh); + IndexType* p_sorted_token_ids = reinterpret_cast(kargs.p_sorted_token_ids); + IndexType* p_expert_cumsum_smem = s + 4 + 2 * BLOCK_SIZE / warpSize; + const WeightType* p_weights = static_cast(kargs.p_weights); + WeightType* p_sorted_weights = reinterpret_cast(kargs.p_sorted_weights); + + int eid = blockIdx.x; + int wave_id = threadIdx.x / warpSize; + int lane_id = threadIdx.x % warpSize; + int e_start = p_expert_cumsum_smem[eid]; + int e_end = p_expert_cumsum_smem[eid + 1]; + if constexpr(Problem::SkipExpertsWithZeroTokens) + { + if(e_start == e_end) + return; + } + + if constexpr(Problem::LocalExpertMasking) + { + int e_mask = p_local_expert_mask[eid]; + if(e_mask == 0) + return; // skip empty expert + } + + // cumsum one by one + constexpr index_t index_pack = Problem::SubTokenTile; // always packed + using r_t = ext_vector_t; // always use int32x4 + using d_t = ext_vector_t; + int loops = (kargs.mesh_stride / index_pack + BLOCK_SIZE - 1) / BLOCK_SIZE; + int prev_cumsum = 0; + + for(int i = 0; i < loops; i++) + { + int i_token_pack = i * BLOCK_SIZE + threadIdx.x; + r_t x_v = 0; + if(i_token_pack < (kargs.tokens + index_pack - 1) / index_pack) + { + x_v = reinterpret_cast(p_expert_mesh + + eid * kargs.mesh_stride)[i_token_pack]; + } + + r_t x_r; +#if 0 + if constexpr(index_pack != 1) + { + // shuffle, we must have contiguout thread holds contiguout token + __syncthreads(); + reinterpret_cast(s)[threadIdx.x] = x_v; + __syncthreads(); + + static_for<0, index_pack, 1>{}([&](auto j_) { + constexpr auto j = j_.value; + x_r[j] = reinterpret_cast(s)[threadIdx.x + j * BLOCK_SIZE]; + }); + } +#else + x_r = x_v; +#endif + { +#if 0 +#pragma unroll + for(int j = 0; j < index_pack / 2; j++) + { + int i_token = i * BLOCK_SIZE * index_pack + threadIdx.x + j * 
BLOCK_SIZE; + index_t x = x_d[j]; + int i_topk = x - 1; // topk of this token + int i_show = x != 0 ? 1 : 0; // has this token or not + int cumsum = i_show; + impl::moe_sorting_wave_cumsum(cumsum); + + __syncthreads(); + if(lane_id == warpSize - 1) + { + s[4 + wave_id] = cumsum; + } + __syncthreads(); + + // reduce cross wave + static_for<0, BLOCK_SIZE / warpSize - 1, 1>{}([&](auto i_w) { + IndexType prev = s[4 + i_w]; + prev = wave_id > i_w ? prev : 0; // mask out + cumsum += prev; + }); + cumsum += prev_cumsum; // add previous round cumsum + if(threadIdx.x == BLOCK_SIZE - 1) + { + s[0] = cumsum; + } + __syncthreads(); + + int position = cumsum - i_show; + prev_cumsum = s[0]; // update the last cumsum + + if(i_show) + { +#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID + p_sorted_token_ids[e_start + position] = + MOE_SORTING_MOCK_ID(i_token, i_topk); +#else + p_sorted_token_ids[e_start + position] = i_token; +#endif + p_sorted_weights[e_start + position] = + p_weights[i_token * kargs.topk_mdiv.divisor + i_topk]; + } + } +#endif + { + d_t i_topk; + d_t i_show; + // = 0; + int cumsum_store = 0; + + static_for<0, index_pack, 1>{}([&](auto j_) { + constexpr auto j = j_.value; + i_topk[j] = static_cast(x_r[j] - 1); + i_show[j] = static_cast(x_r[j] != 0 ? 1 : 0); + cumsum_store += i_show[j]; + }); + int cumsum = cumsum_store; + impl::moe_sorting_wave_cumsum(cumsum); + + __syncthreads(); + if(lane_id == warpSize - 1) + { + s[4 + wave_id] = cumsum; + } + __syncthreads(); + + // reduce cross wave + static_for<0, BLOCK_SIZE / warpSize - 1, 1>{}([&](auto i_w) { + IndexType prev = s[4 + i_w]; + prev = wave_id > i_w ? 
prev : 0; // mask out + cumsum += prev; + }); + cumsum += prev_cumsum; // add previous round cumsum + if(threadIdx.x == BLOCK_SIZE - 1) + { + s[0] = cumsum; + } + __syncthreads(); + prev_cumsum = s[0]; // update the last cumsum + + int position = cumsum - cumsum_store; + static_for<0, index_pack, 1>{}([&](auto j_) { + constexpr auto j = j_.value; + // int i_token = i * BLOCK_SIZE * index_pack + threadIdx.x + j * + // BLOCK_SIZE; + int i_token = + i * BLOCK_SIZE * index_pack + threadIdx.x * index_pack + j; + + if(i_show[j]) + { +#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID + p_sorted_token_ids[e_start + position] = + MOE_SORTING_MOCK_ID(i_token, i_topk[j]); +#else + p_sorted_token_ids[e_start + position] = i_token; +#endif + p_sorted_weights[e_start + position] = + p_weights[i_token * kargs.topk_mdiv.divisor + i_topk[j]]; + } + position += i_show[j]; + }); + +#if 0 + int i_token = i * BLOCK_SIZE * index_pack + threadIdx.x * 2 + j * BLOCK_SIZE * 2; + index_t x = x_d[j]; + index_t x0 = static_cast(x & 0xffff); + index_t x1 = static_cast(x >> 16); + int i_topk_0 = x0 - 1; // topk of this token + int i_show_0 = x0 != 0 ? 1 : 0; // has this token or not + int i_topk_1 = x1 - 1; // topk of this token + int i_show_1 = x1 != 0 ? 1 : 0; // has this token or not + int cumsum = i_show_0 + i_show_1; + impl::moe_sorting_wave_cumsum(cumsum); + + __syncthreads(); + if(lane_id == warpSize - 1) + { + s[4 + wave_id] = cumsum; + } + __syncthreads(); + + // reduce cross wave + static_for<0, BLOCK_SIZE / warpSize - 1, 1>{}([&](auto i_w) { + IndexType prev = s[4 + i_w]; + prev = wave_id > i_w ? 
prev : 0; // mask out + cumsum += prev; + }); + cumsum += prev_cumsum; // add previous round cumsum + if(threadIdx.x == BLOCK_SIZE - 1) + { + s[0] = cumsum; + } + __syncthreads(); + + int position_0 = cumsum - i_show_0 - i_show_1; + prev_cumsum = s[0]; // update the last cumsum + + if(i_show_0) + { +#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID + p_sorted_token_ids[e_start + position_0] = + MOE_SORTING_MOCK_ID(i_token, i_topk_0); +#else + p_sorted_token_ids[e_start + position_0] = i_token; +#endif + p_sorted_weights[e_start + position_0] = + p_weights[i_token * kargs.topk_mdiv.divisor + i_topk_0]; + } + + int position_1 = cumsum - i_show_1; + + if(i_show_1) + { +#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID + p_sorted_token_ids[e_start + position_1] = + MOE_SORTING_MOCK_ID(i_token + 1, i_topk_1); +#else + p_sorted_token_ids[e_start + position_1] = i_token + 1; +#endif + p_sorted_weights[e_start + position_1] = + p_weights[(i_token + 1) * kargs.topk_mdiv.divisor + i_topk_1]; + } +#endif + } + } + } + + for(index_t i = e_start + prev_cumsum + threadIdx.x; i < e_end; i += BLOCK_SIZE) + { +#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID + p_sorted_token_ids[i] = MOE_SORTING_MOCK_ID(kargs.tokens, kargs.topk_mdiv.divisor); +#else + p_sorted_token_ids[i] = tokens; +#endif + p_sorted_weights[i] = static_cast(0.0); + } + } + } +}; + #undef MOE_SORTING_MOCK_ID } // namespace ck_tile diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_problem.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_problem.hpp index a98e0d7652..39bc6ca93e 100644 --- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_problem.hpp +++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_problem.hpp @@ -50,20 +50,23 @@ struct MoeSortingProblemEx }; template struct MoeSortingProblemMp { // TODO: this kernel only support warp per row using WeightType = remove_cvref_t; + using MeshType = remove_cvref_t; using IndexType = remove_cvref_t; static constexpr index_t SubTokenTile = SubTokenTile_; static constexpr 
bool LocalExpertMasking = LocalExpertMasking_; static constexpr bool SkipExpertsWithZeroTokens = SkipExpertsWithZeroTokens_; - static_assert(SubTokenTile == 1 || SubTokenTile == 2 || SubTokenTile == 4); + static_assert(SubTokenTile == 1 || SubTokenTile == 2 || SubTokenTile == 4 || + SubTokenTile == 8 || SubTokenTile == 16); }; } // namespace ck_tile From 8a0d659f92897e1ae99e4dc0ea4842a2c78170ab Mon Sep 17 00:00:00 2001 From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com> Date: Tue, 6 May 2025 09:24:00 -0500 Subject: [PATCH 097/443] Add FP4 MX MFMA tests (#2151) * Add conversion tests * Fix ctor * Fix nan logic * Fix conversion logic * Permute packed f4_t values * Fix conversion to float, repack vector elements * Fix device tests * Permute elements in a vector * Add a repro test * Add a conversion for a repro test * Update test vectors * Update conversion * Fix the test * Update test vector generator * Fix vector sr conversion * Permute conversion args * Update conversion * Test * Fix packing * Simplify conversion function * Pack conversion in a loop * Pack conversion in a loop * Pack another conversion in a loop * Pack one more conversion in a loop * Pack the last conversion in a loop * Clean up * Add ops * Add tests * Add missing utils * Update reference mx gemm * Add f4x2 init mode * Update host tensor utils * Update chunk size for f4x2 * Add non scaled ops * Add a type utility * Update non scaled reference kernel * Add non scaled tests * Debug mfma arguments * Add more debug info * Update chunk size * Update data layout * Add more debugging * Fix B stride * Fix reference gemm * Fix build * One more reference fix * Add more debug info * Disable some tests * Enable tests * Add fp4 dimensions * Update reference kernels * Temp edits * Remove leftovers * Fix conflicts * Clean up * More clean up * Revert "More clean up" This reverts commit d8d35a0846a8c2f0ccc7defe5f4fc7cc4ef36760. 
* Add layouts to tests --------- Co-authored-by: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com> --- include/ck/library/utility/host_tensor.hpp | 21 +- .../library/utility/host_tensor_generator.hpp | 48 ++- include/ck/utility/amd_xdlops.hpp | 122 +++++++ include/ck/utility/data_type.hpp | 7 + .../cpu/reference_gemm.hpp | 20 ++ .../cpu/reference_mx_gemm.hpp | 50 ++- test/mx_mfma_op/mx_mfma_op.cpp | 114 ++++++- test/mx_mfma_op/mx_mfma_op.hpp | 307 +++++++++++++++--- 8 files changed, 610 insertions(+), 79 deletions(-) diff --git a/include/ck/library/utility/host_tensor.hpp b/include/ck/library/utility/host_tensor.hpp index 2cbca29afc..71417ce7bf 100644 --- a/include/ck/library/utility/host_tensor.hpp +++ b/include/ck/library/utility/host_tensor.hpp @@ -51,7 +51,8 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim) { os << ck::type_convert(v); } - else if constexpr(std::is_same_v) + else if constexpr(std::is_same_v || + std::is_same_v) { const auto packed_floats = ck::type_convert(v); const ck::vector_type vector_of_floats{packed_floats}; @@ -359,7 +360,8 @@ struct Tensor std::size_t GetElementSpaceSize() const { - if constexpr(ck::is_same_v, ck::pk_i4_t>) + if constexpr(ck::is_same_v, ck::pk_i4_t> || + ck::is_same_v, ck::f4x2_pk_t>) { return (mDesc.GetElementSpaceSize() + 1) / 2; } @@ -514,7 +516,8 @@ struct Tensor template std::size_t GetOffsetFromMultiIndex(Is... is) const { - if constexpr(ck::is_same_v, ck::pk_i4_t>) + if constexpr(ck::is_same_v, ck::pk_i4_t> || + ck::is_same_v, ck::f4x2_pk_t>) { return mDesc.GetOffsetFromMultiIndex(is...) / 2; } @@ -527,7 +530,8 @@ struct Tensor template T& operator()(Is... is) { - if constexpr(ck::is_same_v, ck::pk_i4_t>) + if constexpr(ck::is_same_v, ck::pk_i4_t> || + ck::is_same_v, ck::f4x2_pk_t>) { return mData[mDesc.GetOffsetFromMultiIndex(is...) / 2]; } @@ -540,7 +544,8 @@ struct Tensor template const T& operator()(Is... 
is) const { - if constexpr(ck::is_same_v, ck::pk_i4_t>) + if constexpr(ck::is_same_v, ck::pk_i4_t> || + ck::is_same_v, ck::f4x2_pk_t>) { return mData[mDesc.GetOffsetFromMultiIndex(is...) / 2]; } @@ -552,7 +557,8 @@ struct Tensor T& operator()(std::vector idx) { - if constexpr(ck::is_same_v, ck::pk_i4_t>) + if constexpr(ck::is_same_v, ck::pk_i4_t> || + ck::is_same_v, ck::f4x2_pk_t>) { return mData[mDesc.GetOffsetFromMultiIndex(idx) / 2]; } @@ -564,7 +570,8 @@ struct Tensor const T& operator()(std::vector idx) const { - if constexpr(ck::is_same_v, ck::pk_i4_t>) + if constexpr(ck::is_same_v, ck::pk_i4_t> || + ck::is_same_v, ck::f4x2_pk_t>) { return mData[mDesc.GetOffsetFromMultiIndex(idx) / 2]; } diff --git a/include/ck/library/utility/host_tensor_generator.hpp b/include/ck/library/utility/host_tensor_generator.hpp index 274051da83..785f74a3c0 100644 --- a/include/ck/library/utility/host_tensor_generator.hpp +++ b/include/ck/library/utility/host_tensor_generator.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -81,6 +81,18 @@ struct GeneratorTensor_1 } }; +template <> +struct GeneratorTensor_1 +{ + float value = 1.0; + + template + ck::f4x2_pk_t operator()(Is...) + { + return ck::f4x2_pk_t{ck::type_convert(ck::float2_t{value, value})}; + } +}; + template <> struct GeneratorTensor_1 { @@ -209,6 +221,21 @@ struct GeneratorTensor_2 } }; +template <> +struct GeneratorTensor_2 +{ + int min_value = 0; + int max_value = 1; + + template + ck::f4x2_pk_t operator()(Is...) 
+ { + float tmp0 = (std::rand() % (max_value - min_value)) + min_value; + float tmp1 = (std::rand() % (max_value - min_value)) + min_value; + return ck::f4x2_pk_t{ck::type_convert(ck::float2_t{tmp0, tmp1})}; + } +}; + template struct GeneratorTensor_3 { @@ -296,6 +323,25 @@ struct GeneratorTensor_3 } }; +template <> +struct GeneratorTensor_3 +{ + float min_value = 0; + float max_value = 1; + + template + ck::f4x2_pk_t operator()(Is...) + { + float tmp0 = float(std::rand()) / float(RAND_MAX); + float tmp1 = float(std::rand()) / float(RAND_MAX); + + float fp32_tmp0 = min_value + tmp0 * (max_value - min_value); + float fp32_tmp1 = min_value + tmp1 * (max_value - min_value); + + return ck::f4x2_pk_t{ck::type_convert(ck::float2_t{fp32_tmp0, fp32_tmp1})}; + } +}; + template struct GeneratorTensor_4 { diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index 71e1937a23..66c4958e1d 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -508,6 +508,34 @@ struct intrin_mfma_f32_32x32x64f8f6f4<32, 32> ignore = reg_a; ignore = reg_b; ignore = reg_c; +#endif + } + + template + __device__ static void Run(const f4x32_t& reg_a, const f4x32_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx950__) + + int32x4_t arg_a = bit_cast(reg_a); + int32x4_t arg_b = bit_cast(reg_b); + + using arg_type = int32x8_t; + + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + arg_type{arg_a[0], arg_a[1], arg_a[2], arg_a[3], 0, 0, 0, 0}, + arg_type{arg_b[0], arg_b[1], arg_b[2], arg_b[3], 0, 0, 0, 0}, + reg_c.template AsType()[Number<0>{}], + 4, // cbsz + 4, // blgp + 0, // OPSEL + 0, + 0, // OPSEL + 0); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; #endif } }; @@ -589,6 +617,40 @@ struct intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32> ignore = reg_b; ignore = scale_b; ignore = reg_c; +#endif + } + + template + __device__ static void Run(const f4x32_t& reg_a, + const int32_t scale_a, + 
const f4x32_t& reg_b, + const int32_t scale_b, + FloatC& reg_c) + { +#if defined(__gfx950__) + + int32x4_t arg_a = bit_cast(reg_a); + int32x4_t arg_b = bit_cast(reg_b); + + using arg_type = int32x8_t; + + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + arg_type{arg_a[0], arg_a[1], arg_a[2], arg_a[3], 0, 0, 0, 0}, + arg_type{arg_b[0], arg_b[1], arg_b[2], arg_b[3], 0, 0, 0, 0}, + reg_c.template AsType()[Number<0>{}], + 4, // cbsz + 4, // blgp + 0, // OPSEL + scale_a, + 0, // OPSEL + scale_b); +#else + ignore = reg_a; + ignore = scale_a; + ignore = reg_b; + ignore = scale_b; + ignore = reg_c; #endif } }; @@ -686,6 +748,39 @@ struct intrin_mfma_scale_f32_16x16x128f8f6f4<16, 16> #endif } + template + __device__ static void Run(const f4x32_t& reg_a, + const int32_t scale_a, + const f4x32_t& reg_b, + const int32_t scale_b, + FloatC& reg_c) + { +#if defined(__gfx950__) + int32x4_t arg_a = bit_cast(reg_a); + int32x4_t arg_b = bit_cast(reg_b); + + using arg_type = int32x8_t; + + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + arg_type{arg_a[0], arg_a[1], arg_a[2], arg_a[3], 0, 0, 0, 0}, + arg_type{arg_b[0], arg_b[1], arg_b[2], arg_b[3], 0, 0, 0, 0}, + reg_c.template AsType()[Number<0>{}], + 4, // cbsz + 4, // blgp + 0, // OPSEL + scale_a, + 0, // OPSEL + scale_b); +#else + ignore = reg_a; + ignore = scale_a; + ignore = reg_b; + ignore = scale_b; + ignore = reg_c; +#endif + } + template __device__ static void Run(const bf8x32_t& reg_a, const int32_t& scale_a, @@ -748,6 +843,33 @@ struct intrin_mfma_f32_16x16x128f8f6f4<16, 16> ignore = reg_a; ignore = reg_b; ignore = reg_c; +#endif + } + + template + __device__ static void Run(const f4x32_t& reg_a, const f4x32_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx950__) + int32x4_t arg_a = bit_cast(reg_a); + int32x4_t arg_b = bit_cast(reg_b); + + using arg_type = int32x8_t; + + reg_c.template AsType()(Number<0>{}) = + 
__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + arg_type{arg_a[0], arg_a[1], arg_a[2], arg_a[3], 0, 0, 0, 0}, + arg_type{arg_b[0], arg_b[1], arg_b[2], arg_b[3], 0, 0, 0, 0}, + reg_c.template AsType()[Number<0>{}], + 4, // cbsz + 4, // blgp + 0, // OPSEL + 0, + 0, // OPSEL + 0); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; #endif } }; diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index 79bd717501..a6106bb146 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -470,6 +470,13 @@ struct scalar_type static constexpr index_t vector_size = 1; }; +template <> +struct scalar_type +{ + using type = f4x2_pk_t::type; + static constexpr index_t vector_size = 1; +}; + template <> struct scalar_type { diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp index 7e2482807d..c8d284a1d7 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp @@ -79,6 +79,16 @@ struct ReferenceGemm : public device::BaseOperator i4 = i4 - 8; v_a = type_convert(i4); } + else if constexpr(is_same_v) + { + // TODO: add support for ColMajor layout as well + if(k % 2 == 1) + v_a = type_convert( + f4_t(arg.a_m_k_(m, k).template unpack<>(Number<1>{}))); + else + v_a = type_convert( + f4_t(arg.a_m_k_(m, k).template unpack<>(Number<0>{}))); + } else { arg.a_element_op_(v_a, arg.a_m_k_(m, k)); @@ -95,6 +105,16 @@ struct ReferenceGemm : public device::BaseOperator i4 = i4 - 8; v_b = type_convert(i4); } + else if constexpr(is_same_v) + { + // TODO: add support for RowMajor layout as well + if(k % 2 == 1) + v_b = type_convert( + f4_t(arg.b_k_n_(k, n).template unpack<>(Number<1>{}))); + else + v_b = type_convert( + f4_t(arg.b_k_n_(k, n).template unpack<>(Number<0>{}))); + } else { 
arg.b_element_op_(v_b, arg.b_k_n_(k, n)); diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_mx_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_mx_gemm.hpp index 649f130c41..e8fdcf1acd 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_mx_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_mx_gemm.hpp @@ -89,9 +89,28 @@ struct ReferenceMXGemm : public device::BaseOperator { for(size_t k = 0; k < K; k++) { - a_m_k_scaled(m, k) = - type_convert(arg.a_m_k_(m, k)) * - type_convert(arg.a_m_kblock_scales_(m, k / SCALE_BLOCK)); + if constexpr(is_same_v) + { + // TODO: add support for ColMajor layout as well + if(k % 2 == 1) + a_m_k_scaled(m, k) = + type_convert( + f4_t(arg.a_m_k_(m, k).template unpack<>(Number<1>{}))) * + type_convert( + arg.a_m_kblock_scales_(m, k / SCALE_BLOCK)); + else + a_m_k_scaled(m, k) = + type_convert( + f4_t(arg.a_m_k_(m, k).template unpack<>(Number<0>{}))) * + type_convert( + arg.a_m_kblock_scales_(m, k / SCALE_BLOCK)); + } + else + { + a_m_k_scaled(m, k) = + type_convert(arg.a_m_k_(m, k)) * + type_convert(arg.a_m_kblock_scales_(m, k / SCALE_BLOCK)); + } } } @@ -99,9 +118,28 @@ struct ReferenceMXGemm : public device::BaseOperator { for(size_t k = 0; k < K; k++) { - b_k_n_scaled(k, n) = - type_convert(arg.b_k_n_(k, n)) * - type_convert(arg.b_kblock_n_scales_(k / SCALE_BLOCK, n)); + if constexpr(is_same_v) + { + // TODO: add support for RowMajor layout as well + if(k % 2 == 1) + b_k_n_scaled(k, n) = + type_convert( + f4_t(arg.b_k_n_(k, n).template unpack<>(Number<1>{}))) * + type_convert( + arg.b_kblock_n_scales_(k / SCALE_BLOCK, n)); + else + b_k_n_scaled(k, n) = + type_convert( + f4_t(arg.b_k_n_(k, n).template unpack<>(Number<0>{}))) * + type_convert( + arg.b_kblock_n_scales_(k / SCALE_BLOCK, n)); + } + else + { + b_k_n_scaled(k, n) = + type_convert(arg.b_k_n_(k, n)) * + type_convert(arg.b_kblock_n_scales_(k / SCALE_BLOCK, 
n)); + } } } diff --git a/test/mx_mfma_op/mx_mfma_op.cpp b/test/mx_mfma_op/mx_mfma_op.cpp index f65e89bb82..fddb8288a6 100644 --- a/test/mx_mfma_op/mx_mfma_op.cpp +++ b/test/mx_mfma_op/mx_mfma_op.cpp @@ -6,6 +6,8 @@ #include "mx_mfma_op.hpp" using ck::e8m0_bexp_t; +using ck::f4_t; +using ck::f4x2_pk_t; using ck::f8_t; using ck::half_t; using ck::type_convert; @@ -16,7 +18,7 @@ using ck::type_convert; * @param init - selects initialization algorithm for A and B tensors */ template -bool run_mfma_test(ck::index_t init) +bool run_mfma_km_kn_nm_test(ck::index_t init) { using ALayout = ck::tensor_layout::gemm::ColumnMajor; using BLayout = ck::tensor_layout::gemm::ColumnMajor; @@ -30,7 +32,8 @@ bool run_mfma_test(ck::index_t init) constexpr auto BLOCK_N = mfma_instr.n_per_blk; constexpr auto BLOCK_K = mfma_instr.num_input_blks * mfma_instr.k_per_blk; - const auto mfma_kernel = ck::matmul; + const auto mfma_kernel = ck:: + matmul; bool pass = true; @@ -52,15 +55,72 @@ bool run_mfma_test(ck::index_t init) TEST(MFMA, FP8MFMA16x16x128) { - auto AB_init = 4; - auto pass = run_mfma_test(AB_init); + auto AB_init = 5; + auto pass = run_mfma_km_kn_nm_test(AB_init); EXPECT_TRUE(pass); } TEST(MFMA, FP8MFMA32x32x64) +{ + auto AB_init = 5; + auto pass = run_mfma_km_kn_nm_test(AB_init); + EXPECT_TRUE(pass); +} + +/** + * @brief Run the test for the given MFMA instruction + * + * @param init - selects initialization algorithm for A and B tensors + */ +template +bool run_mfma_mk_kn_mn_test(ck::index_t init) +{ + using ALayout = ck::tensor_layout::gemm::RowMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::RowMajor; + + using AccType = float; // only MFMA_F32 instructions supported + using CPUAccType = AccType; + + ck::mfma_type(mfma)> mfma_instr; + constexpr auto BLOCK_M = mfma_instr.m_per_blk; + constexpr auto BLOCK_N = mfma_instr.n_per_blk; + constexpr auto BLOCK_K = mfma_instr.num_input_blks * mfma_instr.k_per_blk; + + const auto 
mfma_kernel = ck:: + matmul; + + bool pass = true; + + pass = ck::mfma_test::TestMFMA{}(mfma_kernel, init); + + return pass; +} + +TEST(MFMA, FP4MFMA16x16x128) { auto AB_init = 4; - auto pass = run_mfma_test(AB_init); + auto pass = run_mfma_mk_kn_mn_test( + AB_init); + EXPECT_TRUE(pass); +} + +TEST(MFMA, FP4MFMA32x32x64) +{ + auto AB_init = 4; + auto pass = run_mfma_mk_kn_mn_test( + AB_init); EXPECT_TRUE(pass); } @@ -70,7 +130,7 @@ TEST(MFMA, FP8MFMA32x32x64) * @param init - selects initialization algorithm for A and B tensors */ template -bool run_mxmfma_test(ck::index_t init) +bool run_mxmfma_mk_kn_mn_test(ck::index_t init) { static_assert(mfma == ck::MFMA_F8F6F4::SCALE_F32_16x16x128 || mfma == ck::MFMA_F8F6F4::SCALE_F32_32x32x64, @@ -88,8 +148,18 @@ bool run_mxmfma_test(ck::index_t init) constexpr auto BLOCK_K = mfma_instr.num_input_blks * mfma_instr.k_per_blk; constexpr auto BLOCK_X = 32; // scaling vector size - const auto mx_mfma_kernel = - ck::matmul; + const auto mx_mfma_kernel = ck::matmul; bool pass = true; @@ -111,14 +181,34 @@ bool run_mxmfma_test(ck::index_t init) TEST(MXMFMA, MXFP8MFMA16x16x128) { - auto AB_init = 7; - auto pass = run_mxmfma_test(AB_init); + auto AB_init = 5; + auto pass = + run_mxmfma_mk_kn_mn_test(AB_init); EXPECT_TRUE(pass); } TEST(MXMFMA, MXFP8MFMA32x32x64) { - auto AB_init = 7; - auto pass = run_mxmfma_test(AB_init); + auto AB_init = 5; + auto pass = + run_mxmfma_mk_kn_mn_test(AB_init); + EXPECT_TRUE(pass); +} + +TEST(MXMFMA, MXFP4MFMA16x16x128) +{ + auto AB_init = 4; + auto pass = + run_mxmfma_mk_kn_mn_test( + AB_init); + EXPECT_TRUE(pass); +} + +TEST(MXMFMA, MXFP4MFMA32x32x64) +{ + auto AB_init = 4; + auto pass = + run_mxmfma_mk_kn_mn_test( + AB_init); EXPECT_TRUE(pass); } diff --git a/test/mx_mfma_op/mx_mfma_op.hpp b/test/mx_mfma_op/mx_mfma_op.hpp index d22157c3b3..9ce871cfb1 100644 --- a/test/mx_mfma_op/mx_mfma_op.hpp +++ b/test/mx_mfma_op/mx_mfma_op.hpp @@ -5,6 +5,7 @@ #include "ck/ck.hpp" +#include 
"ck/utility/data_type.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" @@ -111,7 +112,7 @@ template __device__ AFragT load_A_col_major(AType const* input_ptr) { // clang-format off - // Register Mapping for 16x128: || Register Mapping for 32x64: + // Register Mapping for 16x128 for FP8: || Register Mapping for 32x64 for FP8: // Size | BLOCK_M | BLOCK_M | BLOCK_M | BLOCK_M | || Size | BLOCK_M | BLOCK_M | | // M | 0 ... 15 | 0 ... 15 | 0 ... 15 | 0 ... 15 | Vector || M | 0 ... 31 | 0 ... 31 | Vector | // Thread Id | 0 ... 15 | 16 ... 31 | 32 ... 47 | 48 ... 63 | Element || Thread Id | 0 ... 31 | 32 ... 63 | Element| @@ -176,13 +177,19 @@ __device__ AFragT load_A_col_major(AType const* input_ptr) auto kMinorOffset = col_major(minorStepCoord2D, BLOCK_M); auto kMajorOffset = col_major(majorStepCoord2D, BLOCK_M); - using ARawT = typename scalar_type::type; - using AScalarFragT = vector_type::type; + using ARawT = typename scalar_type::type; + using AScalarFragT = + vector_type, ck::f4x2_pk_t> ? 2 : 1)>::type; AScalarFragT fragA{}; + constexpr index_t num_chunks = + (ck::is_same_v, ck::f4x2_pk_t> ? 1 : 2); + #pragma unroll - for(int chunk = 0; chunk < 2; chunk++) + for(int chunk = 0; chunk < num_chunks; chunk++) { #pragma unroll for(uint32_t i = 0; i < chunk_size; i++) @@ -241,6 +248,28 @@ __device__ AFragT load_A_row_major(AType const* input_ptr) // Reg 7 [8:15] | K77 | K93 | K109 | K125 | v[29] || Reg 7 [8:15] | K45 | K61 | v[29] | // Reg 7 [16:23] | K78 | K94 | K110 | K126 | v[30] || Reg 7 [16:23] | K46 | K62 | v[30] | // Reg 7 [24:31] | K79 | K95 | K111 | K127 | v[31] || Reg 7 [24:31] | K47 | K63 | v[31] | + + // Register Mapping for 16x128 for FP4: || Register Mapping for 32x64 for FP4: + // Size | BLOCK_M | BLOCK_M | BLOCK_M | BLOCK_M | || Size | BLOCK_M | BLOCK_M | | + // M | 0 ... 15 | 0 ... 15 | 0 ... 15 | 0 ... 15 | Vector || M | 0 ... 31 | 0 ... 
31 | Vector | + // Thread Id | 0 ... 15 | 16 ... 31 | 32 ... 47 | 48 ... 63 | Element || Thread Id | 0 ... 31 | 32 ... 63 | Element| + // Register Element |------------|-------------|------------|-------------|-----------|| Register Element |------------|-------------|--------| + // Reg 0 [0:7] | K0K1 | K32K33 | K64K65 | K96K97 | v[0] || Reg 0 [0:7] | K0K1 | K32K33 | v[0] | + // Reg 0 [8:15] | K2K3 | K34K35 | K66K67 | K98K99 | v[1] || Reg 0 [8:15] | K2K3 | K34K35 | v[1] | + // Reg 0 [16:23] | K4K5 | K36K37 | K68K69 | K100K101 | v[2] || Reg 0 [16:23] | K4K5 | K36K37 | v[2] | + // Reg 0 [24:31] | K6K7 | K38K39 | K70K71 | K102K103 | v[3] || Reg 0 [24:31] | K6K7 | K38K39 | v[3] | + // Reg 1 [0:7] | K8K9 | K40K41 | K72K73 | K104K105 | v[4] || Reg 1 [0:7] | K8K9 | K40K41 | v[4] | + // Reg 1 [8:15] | K10K11 | K42K43 | K74K75 | K106K107 | v[5] || Reg 1 [8:15] | K10K11 | K42K43 | v[5] | + // Reg 1 [16:23] | K12K13 | K44K45 | K76K77 | K108K109 | v[6] || Reg 1 [16:23] | K12K13 | K44K45 | v[6] | + // Reg 1 [24:31] | K14K15 | K46K47 | K78K79 | K110K111 | v[7] || Reg 1 [24:31] | K14K15 | K46K47 | v[7] | + // Reg 2 [0:7] | K16K17 | K48K49 | K80K81 | K112K113 | v[8] || Reg 2 [0:7] | K16K17 | K48K49 | v[8] | + // Reg 2 [8:15] | K18K19 | K50K51 | K82K83 | K114K115 | v[9] || Reg 2 [8:15] | K18K19 | K50K51 | v[9] | + // Reg 2 [16:23] | K20K21 | K52K53 | K84K85 | K116K117 | v[10] || Reg 2 [16:23] | K20K21 | K52K53 | v[10] | + // Reg 2 [24:31] | K22K23 | K54K55 | K86K87 | K118K119 | v[11] || Reg 2 [24:31] | K22K23 | K54K55 | v[11] | + // Reg 3 [0:7] | K24K25 | K56K57 | K88K89 | K120K121 | v[12] || Reg 3 [0:7] | K24K25 | K56K57 | v[12] | + // Reg 3 [8:15] | K26K27 | K58K59 | K90K91 | K122K123 | v[13] || Reg 3 [8:15] | K26K27 | K58K59 | v[13] | + // Reg 3 [16:23] | K28K29 | K60K61 | K92K93 | K124K125 | v[14] || Reg 3 [16:23] | K28K29 | K60K61 | v[14] | + // Reg 3 [24:31] | K30K31 | K62K63 | K94K95 | K126K127 | v[15] || Reg 3 [24:31] | K30K31 | K62K63 | v[15] | // clang-format on static 
constexpr int32_t WAVE_SIZE = 64; @@ -265,23 +294,34 @@ __device__ AFragT load_A_row_major(AType const* input_ptr) auto row_major = [](auto const& coord, auto ld) { return coord.first * ld + coord.second; }; // BLOCK_K is a stride in A matrix - auto startOffset = row_major(startCoord2D, BLOCK_K); - // auto kMinorOffset = row_major(minorStepCoord2D, BLOCK_K); - auto kMajorOffset = row_major(majorStepCoord2D, BLOCK_K); + auto startOffset = row_major( + startCoord2D, BLOCK_K / (ck::is_same_v, ck::f4x2_pk_t> ? 2 : 1)); + // auto kMinorOffset = row_major(minorStepCoord2D, BLOCK_K / + // (ck::is_same_v, ck::f4x2_pk_t> ? 2 : 1)); + auto kMajorOffset = + row_major(majorStepCoord2D, + BLOCK_K / (ck::is_same_v, ck::f4x2_pk_t> ? 2 : 1)); using ARawT = typename scalar_type::type; using AScalarFragT = vector_type::type; + constexpr index_t num_chunks = + (ck::is_same_v, ck::f4x2_pk_t> ? 1 : 2); + union { AFragT frag; - AScalarFragT chunks[2]; + AScalarFragT chunks[num_chunks]; } fragA{}; - auto* fragPtr = reinterpret_cast(input_ptr + startOffset); - fragA.chunks[0] = *fragPtr; - fragPtr = reinterpret_cast(input_ptr + startOffset + kMajorOffset); - fragA.chunks[1] = *fragPtr; + const AScalarFragT* fragPtr; + + for(index_t chunk_idx = 0; chunk_idx < num_chunks; chunk_idx++) + { + fragPtr = reinterpret_cast(input_ptr + startOffset + + chunk_idx * kMajorOffset); + fragA.chunks[chunk_idx] = *fragPtr; + } return fragA.frag; } @@ -339,15 +379,35 @@ __device__ AFragT load_mx_A_row_major(AType const* input_ptr, // Reg 7 [8:15] | K77 | K93 | x(M,2) | K109 | K125 | x(M,3) | v[29] || Reg 7 [8:15] | K45 | K61 | v[29] | x(M,1) | // Reg 7 [16:23] | K78 | K94 | x(M,2) | K110 | K126 | x(M,3) | v[30] || Reg 7 [16:23] | K46 | K62 | v[30] | x(M,1) | // Reg 7 [24:31] | K79 | K95 | x(M,2) | K111 | K127 | x(M,3) | v[31] || Reg 7 [24:31] | K47 | K63 | v[31] | x(M,1) | + + // Register Mapping for 16x128 for FP4: || Register Mapping for 32x64 for FP4: + // Size | BLOCK_M | | BLOCK_M | | BLOCK_M | | 
BLOCK_M | | || Size | BLOCK_M | | BLOCK_M | | | + // M | 0 ... 15 | | 0 ... 15 | | 0 ... 15 | | 0 ... 15 | | Vector || M | 0 ... 31 | | 0 ... 31 | | Vector | + // Thread Id | 0 ... 15 | Scale | 16 ... 31 | Scale | 32 ... 47 | Scale | 48 ... 63 | Scale | Element || Thread Id | 0 ... 31 | Scale | 32 ... 63 | Scale | Element| + // Register Element |------------ ----------|------------- ----------|------------ ----------|------------- ----------|-----------|| Register Element |------------|----------|-------------|----------|--------| + // Reg 0 [0:7] | K0K1 | x(M,0) | K32K33 | x(M,1) | K64K65 | x(M,2) | K96K97 | x(M,3) | v[0] || Reg 0 [0:7] | K0K1 | x(M,0) | K32K33 | x(M,1) | v[0] | + // Reg 0 [8:15] | K2K3 | x(M,0) | K34K35 | x(M,1) | K66K67 | x(M,2) | K98K99 | x(M,3) | v[1] || Reg 0 [8:15] | K2K3 | x(M,0) | K34K35 | x(M,1) | v[1] | + // Reg 0 [16:23] | K4K5 | x(M,0) | K36K37 | x(M,1) | K68K69 | x(M,2) | K100K101 | x(M,3) | v[2] || Reg 0 [16:23] | K4K5 | x(M,0) | K36K37 | x(M,1) | v[2] | + // Reg 0 [24:31] | K6K7 | x(M,0) | K38K39 | x(M,1) | K70K71 | x(M,2) | K102K103 | x(M,3) | v[3] || Reg 0 [24:31] | K6K7 | x(M,0) | K38K39 | x(M,1) | v[3] | + // Reg 1 [0:7] | K8K9 | x(M,0) | K40K41 | x(M,1) | K72K73 | x(M,2) | K104K105 | x(M,3) | v[4] || Reg 1 [0:7] | K8K9 | x(M,0) | K40K41 | x(M,1) | v[4] | + // Reg 1 [8:15] | K10K11 | x(M,0) | K42K43 | x(M,1) | K74K75 | x(M,2) | K106K107 | x(M,3) | v[5] || Reg 1 [8:15] | K10K11 | x(M,0) | K42K43 | x(M,1) | v[5] | + // Reg 1 [16:23] | K12K13 | x(M,0) | K44K45 | x(M,1) | K76K77 | x(M,2) | K108K109 | x(M,3) | v[6] || Reg 1 [16:23] | K12K13 | x(M,0) | K44K45 | x(M,1) | v[6] | + // Reg 1 [24:31] | K14K15 | x(M,0) | K46K47 | x(M,1) | K78K79 | x(M,2) | K110K111 | x(M,3) | v[7] || Reg 1 [24:31] | K14K15 | x(M,0) | K46K47 | x(M,1) | v[7] | + // Reg 2 [0:7] | K16K17 | x(M,0) | K48K49 | x(M,1) | K80K81 | x(M,2) | K112K113 | x(M,3) | v[8] || Reg 2 [0:7] | K16K17 | x(M,0) | K48K49 | x(M,1) | v[8] | + // Reg 2 [8:15] | K18K19 | x(M,0) | K50K51 
| x(M,1) | K82K83 | x(M,2) | K114K115 | x(M,3) | v[9] || Reg 2 [8:15] | K18K19 | x(M,0) | K50K51 | x(M,1) | v[9] | + // Reg 2 [16:23] | K20K21 | x(M,0) | K52K53 | x(M,1) | K84K85 | x(M,2) | K116K117 | x(M,3) | v[10] || Reg 2 [16:23] | K20K21 | x(M,0) | K52K53 | x(M,1) | v[10] | + // Reg 2 [24:31] | K22K23 | x(M,0) | K54K55 | x(M,1) | K86K87 | x(M,2) | K118K119 | x(M,3) | v[11] || Reg 2 [24:31] | K22K23 | x(M,0) | K54K55 | x(M,1) | v[11] | + // Reg 3 [0:7] | K24K25 | x(M,0) | K56K57 | x(M,1) | K88K89 | x(M,2) | K120K121 | x(M,3) | v[12] || Reg 3 [0:7] | K24K25 | x(M,0) | K56K57 | x(M,1) | v[12] | + // Reg 3 [8:15] | K26K27 | x(M,0) | K58K59 | x(M,1) | K90K91 | x(M,2) | K122K123 | x(M,3) | v[13] || Reg 3 [8:15] | K26K27 | x(M,0) | K58K59 | x(M,1) | v[13] | + // Reg 3 [16:23] | K28K29 | x(M,0) | K60K61 | x(M,1) | K92K93 | x(M,2) | K124K125 | x(M,3) | v[14] || Reg 3 [16:23] | K28K29 | x(M,0) | K60K61 | x(M,1) | v[14] | + // Reg 3 [24:31] | K30K31 | x(M,0) | K62K63 | x(M,1) | K94K95 | x(M,2) | K126K127 | x(M,3) | v[15] || Reg 3 [24:31] | K30K31 | x(M,0) | K62K63 | x(M,1) | v[15] | // clang-format on - static constexpr uint32_t VW = vectorSize(AFragT{}); - static_assert(VW == BLOCK_X, "Fragment size must be equal to BLOCK_X"); // To start the loading process, let's visualize in 2D coords. // Each thread will load 1 element // We need to know where they start - auto startCoord2D = std::make_pair(threadIdx.x % BLOCK_M, // Row - (threadIdx.x / BLOCK_M) * VW / BLOCK_X); // Col + auto startCoord2D = std::make_pair(threadIdx.x % BLOCK_M, // Row + (threadIdx.x / BLOCK_M)); // Col // Flatten to 1D row_major offsets. 
auto row_major = [](auto const& coord, auto ld) { return coord.first * ld + coord.second; }; @@ -369,7 +429,7 @@ template __device__ BFragT load_B_col_major(BType const* input_ptr) { // clang-format off - // Register Mapping for 128x16: || Register Mapping for 64x32: + // Register Mapping for 128x16 for FP8: || Register Mapping for 64x32 for FP8: // Size | BLOCK_N | BLOCK_N | BLOCK_N | BLOCK_N | || Size | BLOCK_N | BLOCK_N | | // N | 0 ... 15 | 0 ... 15 | 0 ... 15 | 0 ... 15 | Vector || N | 0 ... 31 | 0 ... 31 | Vector | // Thread Id | 0 ... 15 | 16 ... 31 | 32 ... 47 | 48 ... 63 | Element || Thread Id | 0 ... 31 | 32 ... 63 | Element| @@ -406,6 +466,28 @@ __device__ BFragT load_B_col_major(BType const* input_ptr) // Reg 7 [8:15] | K77 | K93 | K109 | K125 | v[29] || Reg 7 [8:15] | K45 | K61 | v[29] | // Reg 7 [16:23] | K78 | K94 | K110 | K126 | v[30] || Reg 7 [16:23] | K46 | K62 | v[30] | // Reg 7 [24:31] | K79 | K95 | K111 | K127 | v[31] || Reg 7 [24:31] | K47 | K63 | v[31] | + + // Register Mapping for 128x16 for FP4: || Register Mapping for 64x32 for FP4: + // Size | BLOCK_N | BLOCK_N | BLOCK_N | BLOCK_N | || Size | BLOCK_N | BLOCK_N | | + // N | 0 ... 15 | 0 ... 15 | 0 ... 15 | 0 ... 15 | Vector || N | 0 ... 31 | 0 ... 31 | Vector | + // Thread Id | 0 ... 15 | 16 ... 31 | 32 ... 47 | 48 ... 63 | Element || Thread Id | 0 ... 31 | 32 ... 
63 | Element| + // Register Element |------------|-------------|------------|-------------|-----------|| Register Element |------------|-------------|--------| + // Reg 0 [0:7] | K0K1 | K32K33 | K64K65 | K96K97 | v[0] || Reg 0 [0:7] | K0K1 | K32K33 | v[0] | + // Reg 0 [8:15] | K2K3 | K34K35 | K66K67 | K98K99 | v[1] || Reg 0 [8:15] | K2K3 | K34K35 | v[1] | + // Reg 0 [16:23] | K4K5 | K36K37 | K68K69 | K100K101 | v[2] || Reg 0 [16:23] | K4K5 | K36K37 | v[2] | + // Reg 0 [24:31] | K6K7 | K38K39 | K70K71 | K102K103 | v[3] || Reg 0 [24:31] | K6K7 | K38K39 | v[3] | + // Reg 1 [0:7] | K8K9 | K40K41 | K72K73 | K104K105 | v[4] || Reg 1 [0:7] | K8K9 | K40K41 | v[4] | + // Reg 1 [8:15] | K10K11 | K42K43 | K74K75 | K106K107 | v[5] || Reg 1 [8:15] | K10K11 | K42K43 | v[5] | + // Reg 1 [16:23] | K12K13 | K44K45 | K76K77 | K108K109 | v[6] || Reg 1 [16:23] | K12K13 | K44K45 | v[6] | + // Reg 1 [24:31] | K14K15 | K46K47 | K78K79 | K110K111 | v[7] || Reg 1 [24:31] | K14K15 | K46K47 | v[7] | + // Reg 2 [0:7] | K16K17 | K48K49 | K80K81 | K112K113 | v[8] || Reg 2 [0:7] | K16K17 | K48K49 | v[8] | + // Reg 2 [8:15] | K18K19 | K50K51 | K82K83 | K114K115 | v[9] || Reg 2 [8:15] | K18K19 | K50K51 | v[9] | + // Reg 2 [16:23] | K20K21 | K52K53 | K84K85 | K116K117 | v[10] || Reg 2 [16:23] | K20K21 | K52K53 | v[10] | + // Reg 2 [24:31] | K22K23 | K54K55 | K86K87 | K118K119 | v[11] || Reg 2 [24:31] | K22K23 | K54K55 | v[11] | + // Reg 3 [0:7] | K24K25 | K56K57 | K88K89 | K120K121 | v[12] || Reg 3 [0:7] | K24K25 | K56K57 | v[12] | + // Reg 3 [8:15] | K26K27 | K58K59 | K90K91 | K122K123 | v[13] || Reg 3 [8:15] | K26K27 | K58K59 | v[13] | + // Reg 3 [16:23] | K28K29 | K60K61 | K92K93 | K124K125 | v[14] || Reg 3 [16:23] | K28K29 | K60K61 | v[14] | + // Reg 3 [24:31] | K30K31 | K62K63 | K94K95 | K126K127 | v[15] || Reg 3 [24:31] | K30K31 | K62K63 | v[15] | // clang-format on static constexpr int32_t WAVE_SIZE = 64; @@ -430,23 +512,34 @@ __device__ BFragT load_B_col_major(BType const* input_ptr) auto 
majorStepCoord2D = std::make_pair(chunk_offset, 0); // read a chunk from a col // BLOCK_K is a stride in B matrix - auto startOffset = col_major(startCoord2D, BLOCK_K); - // auto kMinorOffset = col_major(minorStepCoord2D, BLOCK_K); - auto kMajorOffset = col_major(majorStepCoord2D, BLOCK_K); + auto startOffset = col_major( + startCoord2D, BLOCK_K / (ck::is_same_v, ck::f4x2_pk_t> ? 2 : 1)); + // auto kMinorOffset = col_major(minorStepCoord2D, BLOCK_K / + // (ck::is_same_v, ck::f4x2_pk_t> ? 2 : 1)); + auto kMajorOffset = + col_major(majorStepCoord2D, + BLOCK_K / (ck::is_same_v, ck::f4x2_pk_t> ? 2 : 1)); using BRawT = typename scalar_type::type; using BScalarFragT = vector_type::type; + constexpr index_t num_chunks = + (ck::is_same_v, ck::f4x2_pk_t> ? 1 : 2); + union { BFragT frag; - BScalarFragT chunks[2]; + BScalarFragT chunks[num_chunks]; } fragB{}; - auto* fragPtr = reinterpret_cast(input_ptr + startOffset); - fragB.chunks[0] = *fragPtr; - fragPtr = reinterpret_cast(input_ptr + startOffset + kMajorOffset); - fragB.chunks[1] = *fragPtr; + const BScalarFragT* fragPtr; + + for(index_t chunk = 0; chunk < num_chunks; chunk++) + { + fragPtr = + reinterpret_cast(input_ptr + startOffset + chunk * kMajorOffset); + fragB.chunks[chunk] = *fragPtr; + } return fragB.frag; } @@ -506,15 +599,56 @@ __device__ BFragT load_mx_B_col_major(BType const* input_ptr, // Reg 7 [16:23] | K78 | K94 | x(2,N) | K110 | K126 | x(3,N) | v[30] || Reg 7 [16:23] | K46 | K62 | v[30] | x(1,N) | // Reg 7 [24:31] | K79 | K95 | x(2,N) | K111 | K127 | x(3,N) | v[31] || Reg 7 [24:31] | K47 | K63 | v[31] | x(1,N) | + // Register Mapping for 128x16 for FP4: || Register Mapping for 64x32 for FP4: + // Size | BLOCK_N | BLOCK_N | BLOCK_N | BLOCK_N | || Size | BLOCK_N | BLOCK_N | | + // N | 0 ... 15 | 0 ... 15 | 0 ... 15 | 0 ... 15 | Vector || N | 0 ... 31 | 0 ... 31 | Vector | + // Thread Id | 0 ... 15 | 16 ... 31 | 32 ... 47 | 48 ... 63 | Element || Thread Id | 0 ... 31 | 32 ... 
63 | Element| + // Register Element |------------|-------------|------------|-------------|-----------|| Register Element |------------|-------------|--------| + // Reg 0 [0:7] | K0K1 | K32K33 | K64K65 | K96K97 | v[0] || Reg 0 [0:7] | K0K1 | K32K33 | v[0] | + // Reg 0 [8:15] | K2K3 | K34K35 | K66K67 | K98K99 | v[1] || Reg 0 [8:15] | K2K3 | K34K35 | v[1] | + // Reg 0 [16:23] | K4K5 | K36K37 | K68K69 | K100K101 | v[2] || Reg 0 [16:23] | K4K5 | K36K37 | v[2] | + // Reg 0 [24:31] | K6K7 | K38K39 | K70K71 | K102K103 | v[3] || Reg 0 [24:31] | K6K7 | K38K39 | v[3] | + // Reg 1 [0:7] | K8K9 | K40K41 | K72K73 | K104K105 | v[4] || Reg 1 [0:7] | K8K9 | K40K41 | v[4] | + // Reg 1 [8:15] | K10K11 | K42K43 | K74K75 | K106K107 | v[5] || Reg 1 [8:15] | K10K11 | K42K43 | v[5] | + // Reg 1 [16:23] | K12K13 | K44K45 | K76K77 | K108K109 | v[6] || Reg 1 [16:23] | K12K13 | K44K45 | v[6] | + // Reg 1 [24:31] | K14K15 | K46K47 | K78K79 | K110K111 | v[7] || Reg 1 [24:31] | K14K15 | K46K47 | v[7] | + // Reg 2 [0:7] | K16K17 | K48K49 | K80K81 | K112K113 | v[8] || Reg 2 [0:7] | K16K17 | K48K49 | v[8] | + // Reg 2 [8:15] | K18K19 | K50K51 | K82K83 | K114K115 | v[9] || Reg 2 [8:15] | K18K19 | K50K51 | v[9] | + // Reg 2 [16:23] | K20K21 | K52K53 | K84K85 | K116K117 | v[10] || Reg 2 [16:23] | K20K21 | K52K53 | v[10] | + // Reg 2 [24:31] | K22K23 | K54K55 | K86K87 | K118K119 | v[11] || Reg 2 [24:31] | K22K23 | K54K55 | v[11] | + // Reg 3 [0:7] | K24K25 | K56K57 | K88K89 | K120K121 | v[12] || Reg 3 [0:7] | K24K25 | K56K57 | v[12] | + // Reg 3 [8:15] | K26K27 | K58K59 | K90K91 | K122K123 | v[13] || Reg 3 [8:15] | K26K27 | K58K59 | v[13] | + // Reg 3 [16:23] | K28K29 | K60K61 | K92K93 | K124K125 | v[14] || Reg 3 [16:23] | K28K29 | K60K61 | v[14] | + // Reg 3 [24:31] | K30K31 | K62K63 | K94K95 | K126K127 | v[15] || Reg 3 [24:31] | K30K31 | K62K63 | v[15] | + + // Register Mapping for 128x16 for FP4: || Register Mapping for 64x32 for FP4: + // Size | BLOCK_N | | BLOCK_N | | BLOCK_N | | BLOCK_N | | || 
Size | BLOCK_N | | BLOCK_N | | | + // N | 0 ... 15 | | 0 ... 15 | | 0 ... 15 | | 0 ... 15 | | Vector || N | 0 ... 31 | | 0 ... 31 | | Vector | + // Thread Id | 0 ... 15 | Scale | 16 ... 31 | Scale | 32 ... 47 | Scale | 48 ... 63 | Scale | Element || Thread Id | 0 ... 31 | Scale | 32 ... 63 | Scale | Element| + // Register Element |------------ ----------|------------- ----------|------------ ----------|------------- ----------|-----------|| Register Element |------------|----------|-------------|----------|--------| + // Reg 0 [0:7] | K0K1 | x(0,N) | K32K33 | x(M,1) | K64K65 | x(M,2) | K96K97 | x(M,3) | v[0] || Reg 0 [0:7] | K0K1 | x(M,0) | K32K33 | x(M,1) | v[0] | + // Reg 0 [8:15] | K2K3 | x(0,N) | K34K35 | x(M,1) | K66K67 | x(M,2) | K98K99 | x(M,3) | v[1] || Reg 0 [8:15] | K2K3 | x(M,0) | K34K35 | x(M,1) | v[1] | + // Reg 0 [16:23] | K4K5 | x(0,N) | K36K37 | x(M,1) | K68K69 | x(M,2) | K100K101 | x(M,3) | v[2] || Reg 0 [16:23] | K4K5 | x(M,0) | K36K37 | x(M,1) | v[2] | + // Reg 0 [24:31] | K6K7 | x(0,N) | K38K39 | x(M,1) | K70K71 | x(M,2) | K102K103 | x(M,3) | v[3] || Reg 0 [24:31] | K6K7 | x(M,0) | K38K39 | x(M,1) | v[3] | + // Reg 1 [0:7] | K8K9 | x(0,N) | K40K41 | x(M,1) | K72K73 | x(M,2) | K104K105 | x(M,3) | v[4] || Reg 1 [0:7] | K8K9 | x(M,0) | K40K41 | x(M,1) | v[4] | + // Reg 1 [8:15] | K10K11 | x(0,N) | K42K43 | x(M,1) | K74K75 | x(M,2) | K106K107 | x(M,3) | v[5] || Reg 1 [8:15] | K10K11 | x(M,0) | K42K43 | x(M,1) | v[5] | + // Reg 1 [16:23] | K12K13 | x(0,N) | K44K45 | x(M,1) | K76K77 | x(M,2) | K108K109 | x(M,3) | v[6] || Reg 1 [16:23] | K12K13 | x(M,0) | K44K45 | x(M,1) | v[6] | + // Reg 1 [24:31] | K14K15 | x(0,N) | K46K47 | x(M,1) | K78K79 | x(M,2) | K110K111 | x(M,3) | v[7] || Reg 1 [24:31] | K14K15 | x(M,0) | K46K47 | x(M,1) | v[7] | + // Reg 2 [0:7] | K16K17 | x(0,N) | K48K49 | x(M,1) | K80K81 | x(M,2) | K112K113 | x(M,3) | v[8] || Reg 2 [0:7] | K16K17 | x(M,0) | K48K49 | x(M,1) | v[8] | + // Reg 2 [8:15] | K18K19 | x(0,N) | K50K51 | x(M,1) | 
K82K83 | x(M,2) | K114K115 | x(M,3) | v[9] || Reg 2 [8:15] | K18K19 | x(M,0) | K50K51 | x(M,1) | v[9] | + // Reg 2 [16:23] | K20K21 | x(0,N) | K52K53 | x(M,1) | K84K85 | x(M,2) | K116K117 | x(M,3) | v[10] || Reg 2 [16:23] | K20K21 | x(M,0) | K52K53 | x(M,1) | v[10] | + // Reg 2 [24:31] | K22K23 | x(0,N) | K54K55 | x(M,1) | K86K87 | x(M,2) | K118K119 | x(M,3) | v[11] || Reg 2 [24:31] | K22K23 | x(M,0) | K54K55 | x(M,1) | v[11] | + // Reg 3 [0:7] | K24K25 | x(0,N) | K56K57 | x(M,1) | K88K89 | x(M,2) | K120K121 | x(M,3) | v[12] || Reg 3 [0:7] | K24K25 | x(M,0) | K56K57 | x(M,1) | v[12] | + // Reg 3 [8:15] | K26K27 | x(0,N) | K58K59 | x(M,1) | K90K91 | x(M,2) | K122K123 | x(M,3) | v[13] || Reg 3 [8:15] | K26K27 | x(M,0) | K58K59 | x(M,1) | v[13] | + // Reg 3 [16:23] | K28K29 | x(0,N) | K60K61 | x(M,1) | K92K93 | x(M,2) | K124K125 | x(M,3) | v[14] || Reg 3 [16:23] | K28K29 | x(M,0) | K60K61 | x(M,1) | v[14] | + // Reg 3 [24:31] | K30K31 | x(0,N) | K62K63 | x(M,1) | K94K95 | x(M,2) | K126K127 | x(M,3) | v[15] || Reg 3 [24:31] | K30K31 | x(M,0) | K62K63 | x(M,1) | v[15] | // clang-format on - static constexpr uint32_t VW = vectorSize(BFragT{}); - static_assert(VW == BLOCK_X, "Fragment size must be equal to BLOCK_X"); // To start the loading process, let's visualize in 2D coords. // Each thread will load 1 element // We need to know where to start - auto startCoord2D = std::make_pair((threadIdx.x / BLOCK_N) * VW / BLOCK_X, // Row - threadIdx.x % BLOCK_N); // Col + auto startCoord2D = std::make_pair((threadIdx.x / BLOCK_N), // Row + threadIdx.x % BLOCK_N); // Col // Flatten to 1D col_major offsets. 
auto col_major = [](auto const& coord, auto ld) { return coord.first + coord.second * ld; }; @@ -766,15 +900,24 @@ template + int32_t BLOCK_K, + typename ALayout, + typename BLayout, + typename CLayout> __global__ void matmul(const AType* a, const BType* b, CType* c) { constexpr int WAVE_SIZE = 64; assert(threadIdx.x < WAVE_SIZE); assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1); - using AFragT = vector_type::type; - using BFragT = vector_type::type; + using AFragT = + vector_type, ck::f4x2_pk_t> ? 2 : 1)>::type; + using BFragT = + vector_type, ck::f4x2_pk_t> ? 2 : 1)>::type; using CFragT = vector_type::type; using AccumFragT = vector_type; using RawAccumFragT = vector_type::type; @@ -786,10 +929,23 @@ __global__ void matmul(const AType* a, const BType* b, CType* c) auto fragAcc = AccumFragT{0}; // Load the inputs. - // A = col major, BLOCK_M x BLOCK_K - fragA = load_A_col_major(a); - // B = col major, BLOCK_K x BLOCK_N - fragB = load_B_col_major(b); + if constexpr(is_same_v) + { + fragA = load_A_row_major(a); + } + else + { + fragA = load_A_col_major(a); + } + + if constexpr(is_same_v) + { + printf("This layout is not implemented\n"); + } + else + { + fragB = load_B_col_major(b); + } // Matrix multiply-accumulate using MFMA units // Accumulation intermediate = BLOCK_M x BLOCK_N @@ -801,8 +957,14 @@ __global__ void matmul(const AType* a, const BType* b, CType* c) fragC[i] = type_convert(fragAcc.template AsType()[Number<0>{}][i]); } - auto storeC = store_C_col_major{}; - storeC(c, fragC); + if constexpr(is_same_v) + { + store_C_row_major{}(c, fragC); + } + else + { + store_C_col_major{}(c, fragC); + } } template + int32_t BLOCK_X, + typename ALayout, + typename BLayout, + typename CLayout> __global__ void matmul(const AType* a, const ScaleType* xa, const BType* b, const ScaleType* xb, CType* c) { @@ -821,8 +986,14 @@ matmul(const AType* a, const ScaleType* xa, const BType* b, const ScaleType* xb, assert(threadIdx.x < WAVE_SIZE); assert(blockDim.x == 1 
&& blockDim.y == 1 && blockDim.z == 1); - using AFragT = vector_type::type; - using BFragT = vector_type::type; + using AFragT = + vector_type, ck::f4x2_pk_t> ? 2 : 1)>::type; + using BFragT = + vector_type, ck::f4x2_pk_t> ? 2 : 1)>::type; using CFragT = vector_type::type; using AccumFragT = vector_type; using RawAccumFragT = vector_type::type; @@ -838,13 +1009,27 @@ matmul(const AType* a, const ScaleType* xa, const BType* b, const ScaleType* xb, auto fragXb = BScaleFragT{}; // Load the inputs. - // A = col major, BLOCK_M x BLOCK_K - fragA = load_mx_A_row_major( - a, xa, fragXa); + if constexpr(is_same_v) + { + fragA = + load_mx_A_row_major( + a, xa, fragXa); + } + else + { + printf("This layout is not implemented\n"); + } - // B = col major, BLOCK_K x BLOCK_N - fragB = load_mx_B_col_major( - b, xb, fragXb); + if constexpr(is_same_v) + { + printf("This layout is not implemented\n"); + } + else + { + fragB = + load_mx_B_col_major( + b, xb, fragXb); + } // Scaled Matrix multiply-accumulate using MFMA units // Accumulation intermediate = BLOCK_M x BLOCK_N @@ -860,8 +1045,14 @@ matmul(const AType* a, const ScaleType* xa, const BType* b, const ScaleType* xb, fragC[i] = type_convert(fragAcc.template AsType()[Number<0>{}][i]); } - auto storeC = store_C_row_major{}; - storeC(c, fragC); + if constexpr(is_same_v) + { + store_C_row_major{}(c, fragC); + } + else + { + store_C_col_major{}(c, fragC); + } } /** @@ -993,8 +1184,7 @@ struct TestMXMFMA { case 0: a_m_k.GenerateTensorValue(GeneratorTensor_1{1.0f}); - a_scales.GenerateTensorValue( - GeneratorTensor_1{ScaleType{0.015625f}}); // 1/64 + a_scales.GenerateTensorValue(GeneratorTensor_1{ScaleType{0.015625f}}); // 1/6 // NOTE: not all numbers are representable in FP8, BF8, etc. 
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 16 18 20 20 20 22 24 24 24 26 28 28 28 30 32 b_n_k.GenerateTensorValue(GeneratorTensor_Sequential{}); @@ -1012,11 +1202,9 @@ struct TestMXMFMA a_m_k.GenerateTensorValue(GeneratorTensor_3{-2.0, 2.0}); a_scales.GenerateTensorValue( GeneratorTensor_2{126, 129}); // scales: {0.5, 1, 2} - b_n_k.GenerateTensorValue(GeneratorTensor_3{-2.0, 2.0}); b_scales.GenerateTensorValue(GeneratorTensor_2{126, 129}); break; - case 3: // expect small round off errors a_m_k.GenerateTensorValue(GeneratorTensor_4(0, 1)); @@ -1026,6 +1214,14 @@ struct TestMXMFMA b_scales.GenerateTensorValue( GeneratorTensor_2{126, 129}); // scales: {0.5, 1, 2} break; + case 4: + a_m_k.GenerateTensorValue(GeneratorTensor_3{-1., 1.}); + a_scales.GenerateTensorValue( + GeneratorTensor_2{126, 129}); // scales: {0.5, 1, 2} + b_n_k.GenerateTensorValue(GeneratorTensor_3{-1., 1.}); + b_scales.GenerateTensorValue( + GeneratorTensor_2{126, 129}); // scales: {0.5, 1, 2} + break; default: // all initial values are representable in FP8, BF8 a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 6}); // Z[-5,5] @@ -1207,6 +1403,11 @@ struct TestMFMA a_m_k.GenerateTensorValue(GeneratorTensor_4(-1, 3)); b_n_k.GenerateTensorValue(GeneratorTensor_4(1, 3)); break; + case 4: + // FP4 values case + a_m_k.GenerateTensorValue(GeneratorTensor_2{-4, 5}); + b_n_k.GenerateTensorValue(GeneratorTensor_2{-4, 5}); + break; default: // all initial values are representable in FP8, BF8 a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 6}); From 769336b6404d36ee6e7ef39baa8fccd3f583a8e7 Mon Sep 17 00:00:00 2001 From: Aviral Goel Date: Wed, 7 May 2025 02:00:39 -0500 Subject: [PATCH 098/443] [CK_TILE] Add type traits to detect tile window types at compile time (#2158) * added WindowType enum to tile_window_structs and static assert checks in computev4 pipeline * added type traits instead of enum to tile_window() and tile_window_linear() with debug comments * removed comments, added documentation and clang 
format --- include/ck_tile/core/tensor/tile_window.hpp | 78 +++++++++++++++++++ .../core/tensor/tile_window_linear.hpp | 46 +++++++++++ .../gemm_pipeline_ag_bg_cr_comp_v4.hpp | 6 ++ 3 files changed, 130 insertions(+) diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp index 3bb728df23..716b1f4ecb 100644 --- a/include/ck_tile/core/tensor/tile_window.hpp +++ b/include/ck_tile/core/tensor/tile_window.hpp @@ -1164,4 +1164,82 @@ CK_TILE_DEVICE void move_tile_window( window.move(step); } +/** + * @brief Type trait to determine if a type is a tile window with static distribution. + * + * Defaults to `false_type`. Specializations define when the trait evaluates to `true`. + * + * @tparam T The type to check. + */ +template +struct is_tile_window_with_static_distribution : std::false_type +{ +}; + +/** + * @brief Specialization for `tile_window_with_static_distribution` to evaluate to `true_type`. + * + * @tparam BottomTensorView_ Bottom tensor view type of the tile window. + * @tparam WindowLengths_ Static window lengths. + * @tparam StaticTileDistribution_ Tile distribution policy. + * @tparam NumCoord Number of coordinate dimensions. + */ +template +struct is_tile_window_with_static_distribution< + tile_window_with_static_distribution> : std::true_type +{ +}; + +/** + * @brief Helper variable template to check if a type is a tile window with static distribution. + * + * Equivalent to `is_tile_window_with_static_distribution::value`. + * + * @tparam T The type to check. + */ +template +inline constexpr bool is_tile_window_with_static_distribution_v = + is_tile_window_with_static_distribution::value; + +/** + * @brief Type trait to determine if a type is a tile window with static lengths. + * + * Defaults to `false_type`. Specializations define when the trait evaluates to `true`. + * + * @tparam T The type to check. 
+ */ +template +struct is_tile_window_with_static_lengths : std::false_type +{ +}; + +/** + * @brief Specialization for `tile_window_with_static_lengths` to evaluate to `true_type`. + * + * @tparam BottomTensorView_ Bottom tensor view type of the tile window. + * @tparam WindowLengths_ Static window lengths. + */ +template +struct is_tile_window_with_static_lengths< + tile_window_with_static_lengths> : std::true_type +{ +}; + +/** + * @brief Helper variable template to check if a type is a tile window with static lengths. + * + * Equivalent to `is_tile_window_with_static_lengths::value`. + * + * @tparam T The type to check. + */ +template +inline constexpr bool is_tile_window_with_static_lengths_v = + is_tile_window_with_static_lengths::value; + } // namespace ck_tile diff --git a/include/ck_tile/core/tensor/tile_window_linear.hpp b/include/ck_tile/core/tensor/tile_window_linear.hpp index 1e24e660f6..5ecaf5ca17 100644 --- a/include/ck_tile/core/tensor/tile_window_linear.hpp +++ b/include/ck_tile/core/tensor/tile_window_linear.hpp @@ -44,6 +44,7 @@ template struct tile_window_linear { + using BottomTensorView = remove_reference_t; using WindowLengths = remove_cvref_t; using TileDstr = remove_cvref_t; @@ -1215,4 +1216,49 @@ CK_TILE_DEVICE void move_tile_window( window.move(step); } +/** + * @brief Type trait to determine if a type is a linear tile window. + * + * Defaults to `false_type`. Specialized to `true_type` for types that match + * `tile_window_linear<...>`. + * + * @tparam T The type to check. + */ +template +struct is_tile_window_linear : std::false_type +{ +}; + +/** + * @brief Specialization of `is_tile_window_linear` for `tile_window_linear`. + * + * Evaluates to `true_type` if the type is a `tile_window_linear` with the given template + * parameters. + * + * @tparam BottomTensorView_ Bottom tensor view type of the tile window. + * @tparam WindowLengths_ Static window lengths. + * @tparam StaticTileDistribution_ Tile distribution policy. 
+ * @tparam LinearBottomDims_ Dimensions of the bottom tensor view that participate in linearization. + */ +template +struct is_tile_window_linear> : std::true_type +{ +}; + +/** + * @brief Helper variable template to check if a type is a linear tile window. + * + * Equivalent to `is_tile_window_linear::value`. + * + * @tparam T The type to check. + */ +template +inline constexpr bool is_tile_window_linear_v = is_tile_window_linear::value; + } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp index 667bb80ce9..6535f612f1 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp @@ -337,6 +337,12 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4 {0, 0}, BLdsTileDistr); + static_assert( + !(is_tile_window_linear_v)&&!(is_tile_window_linear_v)&&!( + is_tile_window_linear_v< + decltype(b_lds_ld_window0)>)&&!(is_tile_window_linear_v), + "LDS windows must not be linear"); + Base::LocalPrefetch(a_block_tile0, a_lds_ld_window0); Base::LocalPrefetch(b_block_tile0, b_lds_ld_window0); From 956fe8f75118de688b1ee9ca8619b2c1dbe35ea1 Mon Sep 17 00:00:00 2001 From: kylasa Date: Wed, 7 May 2025 00:02:59 -0700 Subject: [PATCH 099/443] Simple copy kernel, which can be a tool to experiment with CK_Tile API with minimal code. (#2156) * Test Copy kernel code for testing tile distribution logic * Fix the error * Solved the problem * Updated comments and document formatting * Removed unused tile distribution and code cleanup * Added README.md and formatting for CI/CD. 
--------- Co-authored-by: ThomasNing --- example/ck_tile/36_copy/CMakeLists.txt | 4 + example/ck_tile/36_copy/README.md | 31 +++++ example/ck_tile/36_copy/test_copy.cpp | 117 ++++++++++++++++ example/ck_tile/36_copy/test_copy.hpp | 178 +++++++++++++++++++++++++ example/ck_tile/CMakeLists.txt | 1 + 5 files changed, 331 insertions(+) create mode 100644 example/ck_tile/36_copy/CMakeLists.txt create mode 100644 example/ck_tile/36_copy/README.md create mode 100644 example/ck_tile/36_copy/test_copy.cpp create mode 100644 example/ck_tile/36_copy/test_copy.hpp diff --git a/example/ck_tile/36_copy/CMakeLists.txt b/example/ck_tile/36_copy/CMakeLists.txt new file mode 100644 index 0000000000..d1b9ba923c --- /dev/null +++ b/example/ck_tile/36_copy/CMakeLists.txt @@ -0,0 +1,4 @@ +add_executable(test_copy_kernel EXCLUDE_FROM_ALL test_copy.cpp) +target_compile_options(test_copy_kernel PRIVATE + -mllvm -enable-noalias-to-md-conversion=0 +) \ No newline at end of file diff --git a/example/ck_tile/36_copy/README.md b/example/ck_tile/36_copy/README.md new file mode 100644 index 0000000000..7856f0b4bd --- /dev/null +++ b/example/ck_tile/36_copy/README.md @@ -0,0 +1,31 @@ +# Copy Kernel +This folder contains basic setup code designed to provide a platform for novice +CK_Tile kernel developers to test basic functionality with minimal additional +code compared to the functional code. Sample functional code for a simple +tile distribution for DRAM window and LDS window are provided and data is moved +from DRAM to registers, registers to LDS, LDS to registers and finally data +is moved to output DRAM window for a simple copy operation. 
+ +## build +``` +# in the root of ck_tile +mkdir build && cd build +# you can replace with the appropriate architecture +# (for example gfx90a or gfx942) or leave it blank +sh ../script/cmake-ck-dev.sh ../ +# Make the copy kernel executable +make test_copy_kernel -j +``` +This will result in an executable `build/bin/test_copy_kernel` + +## example +``` +args: + -m input matrix rows. (default 64) + -n input matrix cols. (default 8) + -id warp to use for computation. (default 0) + -v validation flag to check device results. (default 1) + -prec datatype precision to use. (default fp16) + -warmup no. of warmup iterations. (default 50) + -repeat no. of iterations for kernel execution time. (default 100) +``` \ No newline at end of file diff --git a/example/ck_tile/36_copy/test_copy.cpp new file mode 100644 index 0000000000..81ea5255fc --- /dev/null +++ b/example/ck_tile/36_copy/test_copy.cpp @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "ck_tile/host.hpp" +#include +#include "test_copy.hpp" + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("m", "64", "m dimension") + .insert("n", "8", "n dimension") + .insert("id", "0", "warp to use") + .insert("v", "1", "cpu validation or not") + .insert("prec", "fp16", "precision") + .insert("warmup", "50", "cold iter") + .insert("repeat", "100", "hot iter"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +template +bool run(const ck_tile::ArgParser& arg_parser) +{ + using XDataType = DataType; + using YDataType = DataType; + + ck_tile::index_t m = arg_parser.get_int("m"); + ck_tile::index_t n = arg_parser.get_int("n"); + ck_tile::index_t warp_id = arg_parser.get_int("id"); + int do_validation = arg_parser.get_int("v"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + + ck_tile::HostTensor x_host({m, n}); + ck_tile::HostTensor y_host_ref({m, n}); + ck_tile::HostTensor y_host_dev({m, n}); + + // ck_tile::FillConstant{1.f}(x_host); + ck_tile::half_t value = 1; + for(int i = 0; i < m; i++) + { + value = 1; + for(int j = 0; j < n; j++) + { + x_host(i, j) = value++; + } + } + + ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes()); + + x_buf.ToDevice(x_host.data()); + + using BlockWaves = ck_tile::sequence<2, 1>; + using BlockTile = ck_tile::sequence<64, 8>; + using WaveTile = ck_tile::sequence<64, 8>; + using Vector = ck_tile::sequence<1, 4>; + + ck_tile::index_t kGridSize = (m / BlockTile::at(ck_tile::number<0>{})); + std::cout << "grid size " << kGridSize << std::endl; + + using Shape = ck_tile::TileCopyShape; + using Problem = ck_tile::TileCopyProblem; + using Kernel = ck_tile::TileCopy; + + constexpr ck_tile::index_t kBlockSize = 128; + constexpr ck_tile::index_t kBlockPerCu = 1; + std::cout << "block size " << kBlockSize 
<< std::endl; + std::cout << "warp SIze " << ck_tile::get_warp_size() << std::endl; + std::cout << "warps per block _M " << Shape::WarpPerBlock_M << " " << Shape::WarpPerBlock_N + << std::endl; + std::cout << "Block waves: " << BlockWaves::at(ck_tile::number<0>{}) << " " + << BlockWaves::at(ck_tile::number<1>{}) << std::endl; + std::cout << " Wave Groups: " << Shape::WaveGroups << std::endl; + + float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat}, + ck_tile::make_kernel( + Kernel{}, + kGridSize, + kBlockSize, + 0, + static_cast(x_buf.GetDeviceBuffer()), + static_cast(y_buf.GetDeviceBuffer()), + m, + n, + warp_id)); + + std::size_t num_btype = sizeof(XDataType) * m * n + sizeof(YDataType) * m; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl; + + bool pass = true; + + if(do_validation) + { + // reference + y_buf.FromDevice(y_host_dev.mData.data()); + pass = ck_tile::check_err(y_host_dev, x_host); + + std::cout << "valid:" << (pass ? "y" : "n") << std::flush << std::endl; + } + + return pass; +} + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + const std::string data_type = arg_parser.get_str("prec"); + return run(arg_parser) ? 0 : -2; +} diff --git a/example/ck_tile/36_copy/test_copy.hpp b/example/ck_tile/36_copy/test_copy.hpp new file mode 100644 index 0000000000..8fed22a3d0 --- /dev/null +++ b/example/ck_tile/36_copy/test_copy.hpp @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/host/kernel_launch.hpp" + +namespace ck_tile { + +template + typename BlockTile, // block size, seq + typename WaveTile, // warp size, seq + typename Vector> // contiguous elements(vector size) along seq +struct TileCopyShape +{ + // We split Workgroup waves into two specialized groups. + // One for reading data from global -> LDS, the other is doing reduction + static constexpr index_t WaveGroups = 2; + static constexpr index_t MWarps = BlockWaves::at(number<0>{}); + static constexpr index_t NWarps = BlockWaves::at(number<0>{}); + + static constexpr index_t Block_M = BlockTile::at(number<0>{}); + static constexpr index_t Block_N = BlockTile::at(number<1>{}); + + static constexpr index_t Warp_M = WaveTile::at(number<0>{}); + static constexpr index_t Warp_N = WaveTile::at(number<1>{}); + + static constexpr index_t Vector_M = Vector::at(number<0>{}); + static constexpr index_t Vector_N = Vector::at(number<1>{}); + + static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M; + static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N; + + static constexpr index_t WarpPerBlock_M = + integer_divide_ceil(BlockWaves::at(number<0>{}), WaveGroups); + static constexpr index_t WarpPerBlock_N = + integer_divide_ceil(BlockWaves::at(number<1>{}), WaveGroups); + + static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M); + static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N); + + static constexpr index_t WaveNum = reduce_on_sequence(BlockWaves{}, multiplies{}, number<1>{}); + + static constexpr index_t BlockSize = get_warp_size() * WaveNum; + static constexpr index_t WaveGroupSize = WaveNum / WaveGroups; + static_assert(WaveGroupSize == WarpPerBlock_M * WarpPerBlock_N, "Inconsisten wave group size!"); +}; + +template +struct TileCopyProblem +{ + using XDataType = 
remove_cvref_t; + using BlockShape = remove_cvref_t; +}; + +template +struct TileCopy +{ + using Problem = ck_tile::remove_cvref_t; + using XDataType = typename Problem::XDataType; + + template + CK_TILE_DEVICE static constexpr auto MakeDRAMDistribution() + { + using S = typename Problem::BlockShape; + + constexpr index_t warp_size = get_warp_size(); + constexpr index_t X0 = S::ThreadPerWarp_N; // threads needed along N dimension, fastest + // changing with given vector size. + constexpr index_t X1 = + S::Vector_N; // no. of elements along N dimensions to be read by each thread. + + constexpr index_t Y0 = + S::WaveNum / S::WaveGroups; // no. of active warps working in this thread block. + constexpr index_t Y1 = warp_size / X0; // no. of threads in a warp needed along M dimension. + constexpr index_t Y2 = + S::Warp_M / + (Y1 * + Y0); // no. of iterations each warp needs to perform to cover the entire tile window. + + constexpr auto outer_encoding = + tile_distribution_encoding, + tuple, sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<0, 0>>, + sequence<1, 2>, + sequence<1, 1>>{}; + return make_static_tile_distribution(outer_encoding); + } + + CK_TILE_DEVICE void + operator()(const XDataType* p_x, XDataType* p_y, index_t M, index_t N, index_t warp_id) const + { + using S = typename Problem::BlockShape; + + // LDS Data. 
+ __shared__ XDataType x_lds[number{} * number{}]; + XDataType* __restrict__ p_x_lds = static_cast(x_lds); + + const auto x_lds_desc = make_naive_tensor_descriptor( + make_tuple(number{}, number{}, number{}), + make_tuple(number{}, number{}, 1), + number{}, + number<1>{}); + + auto x_lds_block_desc = transform_tensor_descriptor( + x_lds_desc, + make_tuple(make_pass_through_transform(number{}), + make_merge_transform( + make_tuple(number{} / S::Vector_N, number{}))), + make_tuple(sequence<1>{}, sequence<0, 2>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + auto x_lds_view = make_tensor_view(p_x_lds, x_lds_block_desc); + + auto x_block_lds_window = + make_tile_window(x_lds_view, + make_tuple(number{}, number{}), + {0, 0}, + MakeDRAMDistribution()); + auto x_block_lds_window_no_dist = make_tile_window( + x_lds_view, make_tuple(number{}, number{}), {0, 0}); + + // Input tensor + const auto iM = get_block_id() * S::Block_M; + const auto x_m_n = make_naive_tensor_view( + p_x, make_tuple(M, N), make_tuple(N, 1), number{}, number<1>{}); + auto x_block_window = + make_tile_window(x_m_n, + make_tuple(number{}, number{}), + {iM, 0}, + MakeDRAMDistribution()); + + // Output tensor + const auto y_m = make_naive_tensor_view( + p_y, make_tuple(M, N), make_tuple(N, 1), number{}, number<1>{}); + + auto y_block_window = + make_tile_window(y_m, make_tuple(number{}, number{}), {iM, 0}); + + // Programming logic + index_t num_n_tile_iteration = + __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N)); + auto my_id = get_warp_id(); + + auto DramTileDist = x_block_window.get_tile_distribution(); + using dram_reg_tile = decltype(make_static_distributed_tensor(DramTileDist)); + + for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + { + dram_reg_tile dram_tile; + + if(my_id == warp_id) + { + // load from DRAM to registers + load_tile(dram_tile, x_block_window); + + // store in lds + store_tile(x_block_lds_window_no_dist, dram_tile); + + // read 
from lds to registers + load_tile(dram_tile, x_block_lds_window); + + // store from registers to DRAM + store_tile(y_block_window, dram_tile); + } + __syncthreads(); + move_tile_window(x_block_window, {0, S::Block_N}); + move_tile_window(y_block_window, {0, S::Block_N}); + } + } +}; + +} // namespace ck_tile diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index 88efe0d8d9..d479cd35f6 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -19,3 +19,4 @@ add_subdirectory(16_batched_gemm) add_subdirectory(17_grouped_gemm) add_subdirectory(18_flatmm) add_subdirectory(35_batched_transpose) +add_subdirectory(36_copy) From 397b9080a217633f3f35d632329b16f4fababdf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Wed, 7 May 2025 17:04:31 +0200 Subject: [PATCH 100/443] Move 16x16 grouped conv fwd instances from comp header (#2165) * Move 16x16 grouped conv fwd instances from comp header * Improvements --- ...ice_grouped_conv_fwd_xdl_comp_instance.hpp | 21 +-- .../device_grouped_conv_fwd_xdl_instance.hpp | 57 ++++++ .../gpu/grouped_convolution_forward.hpp | 14 ++ .../gpu/grouped_convolution_forward_xdl.inc | 168 ++++++++++++++++++ .../gpu/grouped_conv2d_fwd/CMakeLists.txt | 6 + ..._ngchw_gkcyx_ngkhw_bf16_16x16_instance.cpp | 55 ++++++ ...wd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp | 14 ++ ...l_ngchw_gkcyx_ngkhw_f16_16x16_instance.cpp | 54 ++++++ ...fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp | 14 ++ ...l_ngchw_gkcyx_ngkhw_f32_16x16_instance.cpp | 54 ++++++ ...fwd_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp | 14 ++ ..._nhwgc_gkyxc_nhwgk_bf16_16x16_instance.cpp | 57 ++++++ ...l_nhwgc_gkyxc_nhwgk_f16_16x16_instance.cpp | 56 ++++++ ...l_nhwgc_gkyxc_nhwgk_f32_16x16_instance.cpp | 56 ++++++ .../gpu/grouped_conv3d_fwd/CMakeLists.txt | 6 + ...hwgc_gkzyxc_ndhwgk_bf16_16x16_instance.cpp | 55 ++++++ ...dhwgc_gkzyxc_ndhwgk_f16_16x16_instance.cpp | 54 ++++++ ...dhwgc_gkzyxc_ndhwgk_f32_16x16_instance.cpp | 54 ++++++ 
...cdhw_gkczyx_ngkdhw_bf16_16x16_instance.cpp | 56 ++++++ ...gcdhw_gkczyx_ngkdhw_f16_16x16_instance.cpp | 55 ++++++ ...gcdhw_gkczyx_ngkdhw_f32_16x16_instance.cpp | 55 ++++++ 21 files changed, 957 insertions(+), 18 deletions(-) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_16x16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_16x16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_16x16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_16x16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_16x16_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_16x16_instance.cpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp index 6c0ba2f932..158ed26ec4 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp @@ -4,7 +4,6 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" @@ -90,12 +89,7 @@ using device_grouped_conv_fwd_xdl_bf16_comp_instances = std::tuple< DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - // mfma 16x16 - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, DsLayout, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, DsLayout, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, DsLayout, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 4>, 4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, BF16, BF16, F32, BF16, DsLayout, BF16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3> // clang-format on >; @@ -146,12 +140,7 @@ using 
device_grouped_conv_fwd_xdl_f16_comp_instances = std::tuple< //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - // mfma 16x16 - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 2>, - 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 4>, 4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F16, F16, F32, F16, DsLayout, F16, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding, 1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4> // clang-format on >; @@ -195,11 +184,7 @@ using device_grouped_conv_fwd_xdl_f32_comp_instances = std::tuple< DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - // mfma 16x16 - 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, DsLayout, F32, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding,1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, DsLayout, F32, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding,1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< NDimSpatial,ALayout,BLayout, DsLayout,ELayout, F32, F32, F32, F32, DsLayout, F32, PassThrough, PassThrough, PassThrough, ConvSpec, GemmMNKPadding,1,256, 64, 64, 32, 8, 8, 16, 16, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 4>, 4> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> // clang-format on >; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp index c9ea462316..f5397308dc 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp @@ -97,6 +97,25 @@ using device_grouped_conv_fwd_xdl_bf16_instances = std::tuple< // clang-format on >; +template +using 
device_grouped_conv_fwd_xdl_bf16_16x16_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, 
1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8> + // clang-format on + >; + template ; +template +using device_grouped_conv_fwd_xdl_f16_16x16_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 
0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8> + // clang-format on + >; + template ; +template +using device_grouped_conv_fwd_xdl_f32_16x16_instances = std::tuple< + // clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 
| | | | | | | | | | | | | | + // generic instance + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 4>, 4> + // clang-format on + >; + template ) { add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(op_ptrs); + add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances(op_ptrs); add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances( op_ptrs); add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances( @@ -221,6 +222,7 @@ struct DeviceOperationInstanceFactory) { add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(op_ptrs); + add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances(op_ptrs); add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances( op_ptrs); add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances( @@ -243,6 +245,7 @@ struct DeviceOperationInstanceFactory) { add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(op_ptrs); + add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances(op_ptrs); add_device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances( op_ptrs); add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances( @@ -288,6 +291,7 @@ struct DeviceOperationInstanceFactory) { add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances(op_ptrs); add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances( 
op_ptrs); add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances( @@ -484,6 +491,7 @@ struct DeviceOperationInstanceFactory) { add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances(op_ptrs); add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances( op_ptrs); add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances( @@ -503,6 +511,8 @@ struct DeviceOperationInstanceFactory) { add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(op_ptrs); + add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances( + op_ptrs); add_device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances( op_ptrs); add_device_grouped_conv3d_fwd_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances( @@ -536,6 +546,7 @@ struct DeviceOperationInstanceFactory>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_FP16 @@ -153,6 +167,20 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_FP32 @@ -169,6 +197,20 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_INT8 @@ -267,6 +309,20 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_16x16_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_BF16 @@ -283,6 +339,20 
@@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_16x16_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_FP32 @@ -299,6 +369,20 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_16x16_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_BF16 @@ -382,6 +466,20 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_FP16 @@ -398,6 +496,20 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_FP8 @@ -446,6 +558,20 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_INT8 @@ -532,6 +658,20 @@ void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_16x16_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_BF16 @@ -548,6 +688,20 @@ void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_16x16_instances( + std::vector>>& instances); #endif 
#ifdef CK_ENABLE_FP32 @@ -564,6 +718,20 @@ void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instances( PassThrough, PassThrough, PassThrough>>>& instances); + +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_16x16_instances( + std::vector>>& instances); #endif } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt index 3a101baac0..eba6fd789e 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt @@ -9,6 +9,9 @@ add_instance_library(device_grouped_conv2d_fwd_instance xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp + xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.cpp + xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.cpp + xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.cpp xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp # NGCHW, GKYXC, NGKHW xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp @@ -19,6 +22,9 @@ add_instance_library(device_grouped_conv2d_fwd_instance xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp + xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_16x16_instance.cpp + xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_16x16_instance.cpp + xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_16x16_instance.cpp # large tensor # NHWGC, GKYXC, NHWGK 
xdl/large_tensor/device_grouped_conv2d_fwd_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_16x16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_16x16_instance.cpp new file mode 100644 index 0000000000..0843325287 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_16x16_instance.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_16x16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp index 6c5d9b5b94..4ca1b2b85e 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp @@ -30,6 +30,20 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instances( Empty_Tuple, NGKHW, ConvFwdDefault>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_instances<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwd1x1P0>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_instances<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwd1x1S1P0>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_16x16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_16x16_instance.cpp new file mode 100644 index 0000000000..a82e800bb1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_16x16_instance.cpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_16x16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_16x16_instances<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwdDefault>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_16x16_instances<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_16x16_instances<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp index f1ccad2add..e3a12fd5f4 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp @@ -30,6 +30,20 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instances( Empty_Tuple, NGKHW, ConvFwdDefault>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_instances<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwd1x1P0>{}); + add_device_operation_instances(instances, + 
device_grouped_conv_fwd_xdl_f16_instances<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwd1x1S1P0>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_16x16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_16x16_instance.cpp new file mode 100644 index 0000000000..5918f2479f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_16x16_instance.cpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_16x16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f32_16x16_instances<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwdDefault>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_16x16_instances<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f32_16x16_instances<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp index de7e416e48..467a33deb3 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp @@ -30,6 +30,20 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_instances( Empty_Tuple, NGKHW, ConvFwdDefault>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_instances<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwd1x1P0>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_instances<2, + NGCHW, + GKCYX, + Empty_Tuple, + NGKHW, + ConvFwd1x1S1P0>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.cpp new file mode 100644 index 0000000000..5b8b62010a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.cpp @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdDefault>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.cpp new file mode 100644 index 0000000000..7ca27e21a7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.cpp @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_16x16_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdDefault>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_16x16_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_16x16_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.cpp new file mode 100644 index 0000000000..74cdbde0ba --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.cpp @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f32_16x16_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdDefault>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_16x16_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f32_16x16_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt index eeea4aae6d..f55bdd45c9 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt @@ -7,10 +7,16 @@ set(GROUPED_CONV3D_FWD xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp + xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.cpp + xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.cpp + xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.cpp 
xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp + xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_16x16_instance.cpp + xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_16x16_instance.cpp + xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_16x16_instance.cpp xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp xdl/large_tensor/device_grouped_conv3d_fwd_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.cpp new file mode 100644 index 0000000000..8f113b5234 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<3, + NDHWGC, + GKZYXC, + Empty_Tuple, + NDHWGK, + ConvFwdDefault>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<3, + NDHWGC, + GKZYXC, + Empty_Tuple, + NDHWGK, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<3, + NDHWGC, + GKZYXC, + Empty_Tuple, + NDHWGK, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.cpp new file mode 100644 index 0000000000..1395447660 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.cpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_16x16_instances<3, + NDHWGC, + GKZYXC, + Empty_Tuple, + NDHWGK, + ConvFwdDefault>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_16x16_instances<3, + NDHWGC, + GKZYXC, + Empty_Tuple, + NDHWGK, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_16x16_instances<3, + NDHWGC, + GKZYXC, + Empty_Tuple, + NDHWGK, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.cpp new file mode 100644 index 0000000000..43b3565c74 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.cpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f32_16x16_instances<3, + NDHWGC, + GKZYXC, + Empty_Tuple, + NDHWGK, + ConvFwdDefault>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_16x16_instances<3, + NDHWGC, + GKZYXC, + Empty_Tuple, + NDHWGK, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f32_16x16_instances<3, + NDHWGC, + GKZYXC, + Empty_Tuple, + NDHWGK, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_16x16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_16x16_instance.cpp new file mode 100644 index 0000000000..3b5068d605 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_16x16_instance.cpp @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_16x16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_16x16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_16x16_instance.cpp new file mode 100644 index 0000000000..0ddf5bfa48 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_16x16_instance.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_16x16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_16x16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_16x16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_16x16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_16x16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_16x16_instance.cpp new file mode 100644 index 0000000000..dc4f7be9c0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_16x16_instance.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_16x16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f32_16x16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwdDefault>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_16x16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1P0>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f32_16x16_instances<3, + NGCDHW, + GKCZYX, + Empty_Tuple, + NGKDHW, + ConvFwd1x1S1P0>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck From cb07ad84d5b8a6a796dff34c5d990476b6693b16 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Wed, 7 May 2025 19:46:53 +0200 Subject: [PATCH 101/443] fix for default epilogue (#2167) --- .../ops/epilogue/default_2d_epilogue.hpp | 34 +++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp index 1d6a99eb4b..a2915f5c8f 100644 --- a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp @@ -15,14 +15,16 @@ template + bool UseRawStore_ = true, + memory_operation_enum MemoryOperation_ = memory_operation_enum::set> struct Default2DEpilogueProblem { - using AccDataType = remove_cvref_t; - using ODataType = remove_cvref_t; - static constexpr bool kPadM = kPadM_; - static constexpr bool kPadN = kPadN_; - static 
constexpr bool UseRawStore = UseRawStore_; + using AccDataType = remove_cvref_t; + using ODataType = remove_cvref_t; + static constexpr bool kPadM = kPadM_; + static constexpr bool kPadN = kPadN_; + static constexpr bool UseRawStore = UseRawStore_; + static constexpr memory_operation_enum MemoryOperation = MemoryOperation_; }; template -struct DefaultGemm2DEpilogueProblem - : public Default2DEpilogueProblem + bool UseRawStore_ = true, + memory_operation_enum MemoryOperation_ = memory_operation_enum::set> +struct DefaultGemm2DEpilogueProblem : public Default2DEpilogueProblem { using ADataType = remove_cvref_t; using BDataType = remove_cvref_t; @@ -58,14 +65,13 @@ struct Default2DEpilogue static constexpr bool kPadM = Problem::kPadM; static constexpr bool kPadN = Problem::kPadN; static constexpr bool UseRawStore = Problem::UseRawStore; + static constexpr memory_operation_enum MemoryOperation = Problem::MemoryOperation; CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return 0; } // TODO: this function assume store out vector size is the same as OAccTile last dimension size // how do we fix this ? 
- template + template CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp, const OAccTile& o_acc_tile, void* = nullptr) { @@ -73,7 +79,7 @@ struct Default2DEpilogue // TODO: this is ugly if constexpr(UseRawStore && (kPadM || kPadN)) { - if constexpr(out_memory_data_op == memory_operation_enum::set) + if constexpr(MemoryOperation == memory_operation_enum::set) { store_tile_raw(o_dram_window_tmp, cast_tile(o_acc_tile)); } @@ -85,7 +91,7 @@ struct Default2DEpilogue } else { - if constexpr(out_memory_data_op == memory_operation_enum::set) + if constexpr(MemoryOperation == memory_operation_enum::set) { store_tile(o_dram_window_tmp, cast_tile(o_acc_tile)); } From c7b8e86e342a77f9176b0f4688282fad03eb863b Mon Sep 17 00:00:00 2001 From: Khushbu Agarwal Date: Wed, 7 May 2025 18:37:31 -0700 Subject: [PATCH 102/443] [CK_Tile] Simplified Mem pipeline (#2159) * simplify code * compiled the code * Simplified example and codegen for mem pipeline * Reveting config and universal gemm example * clang formatted * remove comments * clang formatted * Add memory operation changes for defualt pipeline * fix config file --------- Co-authored-by: ThomasNing --- example/ck_tile/03_gemm/universal_gemm.cpp | 81 ++++--------- test/ck_tile/gemm/test_gemm_pipeline_util.hpp | 78 ++++-------- .../gemm/configs/instance_combination.json | 2 +- tile_engine/ops/gemm/gemm_instance_builder.py | 111 +++++++++--------- 4 files changed, 107 insertions(+), 165 deletions(-) diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp index e6a2811918..b60a3b274b 100644 --- a/example/ck_tile/03_gemm/universal_gemm.cpp +++ b/example/ck_tile/03_gemm/universal_gemm.cpp @@ -12,6 +12,19 @@ #include "ck_tile/host.hpp" #include "gemm_utils.hpp" +template +void try_run(ck_tile::TailNumber tn) +{ + if constexpr(Pipeline::PrefetchStages > static_cast(TN)) + { + if(tn == TN) + { + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + } +} + 
template {}, @@ -176,60 +188,17 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& ck_tile::integral_constant{}); } - if constexpr(BaseGemmPipeline::PrefetchStages > 2) - { - if(tail_num == ck_tile::TailNumber::Two) - { - RunSplitk( - ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - } - if constexpr(BaseGemmPipeline::PrefetchStages > 3) - { - if(tail_num == ck_tile::TailNumber::Three) - { - RunSplitk( - ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - } - if constexpr(BaseGemmPipeline::PrefetchStages > 4) - { - if(tail_num == ck_tile::TailNumber::Four) - { - RunSplitk( - ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - } - if constexpr(BaseGemmPipeline::PrefetchStages > 5) - { - if(tail_num == ck_tile::TailNumber::Five) - { - RunSplitk( - ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - } - if constexpr(BaseGemmPipeline::PrefetchStages > 6) - { - if(tail_num == ck_tile::TailNumber::Six) - { - RunSplitk( - ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - } - if constexpr(BaseGemmPipeline::PrefetchStages > 7) - { - if(tail_num == ck_tile::TailNumber::Seven) - { - RunSplitk( - ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - } + auto check_tail = [&](auto... 
TNs) { + (try_run(tail_num), ...); + }; + + check_tail(ck_tile::integral_constant{}, + ck_tile::integral_constant{}, + ck_tile::integral_constant{}, + ck_tile::integral_constant{}, + ck_tile::integral_constant{}, + ck_tile::integral_constant{}); + #elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4) if(tail_num == ck_tile::TailNumber::Three) { @@ -259,7 +228,7 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& else if(tail_num == ck_tile::TailNumber::Even) { RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); + ck_tile::integral_constant{}); } else { diff --git a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp index 0329f16416..85742cb3de 100644 --- a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp +++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp @@ -63,6 +63,19 @@ struct GemmPipelineTypeSelector using pipeline = ck_tile::GemmPipelineAgBgCrCompV4; }; +template +void try_run(ck_tile::TailNumber tn) +{ + if constexpr(Pipeline::PrefetchStages > static_cast(TN)) + { + if(tn == TN) + { + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + } +} + template class TestCkTileGemmPipeline : public ::testing::Test { @@ -251,60 +264,17 @@ class TestCkTileGemmPipeline : public ::testing::Test ck_tile::TailNumber::Full>{}); } - if constexpr(BaseGemmPipeline::PrefetchStages > 2) - { - if(tail_num == ck_tile::TailNumber::Two) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - } - if constexpr(BaseGemmPipeline::PrefetchStages > 3) - { - if(tail_num == ck_tile::TailNumber::Three) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - } - if constexpr(BaseGemmPipeline::PrefetchStages > 4) - { - if(tail_num == ck_tile::TailNumber::Four) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - } - if constexpr(BaseGemmPipeline::PrefetchStages > 5) - { - if(tail_num == 
ck_tile::TailNumber::Five) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - } - if constexpr(BaseGemmPipeline::PrefetchStages > 6) - { - if(tail_num == ck_tile::TailNumber::Six) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - } - if constexpr(BaseGemmPipeline::PrefetchStages > 7) - { - if(tail_num == ck_tile::TailNumber::Seven) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - } + auto check_tail = [&](auto... TNs) { + (try_run(tail_num), ...); + }; + + check_tail( + ck_tile::integral_constant{}, + ck_tile::integral_constant{}, + ck_tile::integral_constant{}, + ck_tile::integral_constant{}, + ck_tile::integral_constant{}, + ck_tile::integral_constant{}); } if constexpr(PipelineType == GemmPipelineType::CompV4) diff --git a/tile_engine/ops/gemm/configs/instance_combination.json b/tile_engine/ops/gemm/configs/instance_combination.json index 66dbdafa11..53197ada6c 100644 --- a/tile_engine/ops/gemm/configs/instance_combination.json +++ b/tile_engine/ops/gemm/configs/instance_combination.json @@ -19,7 +19,7 @@ "values": [256] }, "tile_k": { - "values": [64, 32] + "values": [32] }, "warp_m": { "values": [2] diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index a748c35feb..3839523e3d 100755 --- a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -37,7 +37,9 @@ DEFAULT_EPILOGUE = """ WarpTileM, WarpTileN, WarpTileK, - UniversalGemmProblem::TransposeC>>; + UniversalGemmProblem::TransposeC, + true, + memory_operation>>; """ CSHUFFLE_EPILOGUE = """ @@ -55,22 +57,23 @@ CSHUFFLE_EPILOGUE = """ WarpTileM, WarpTileN, WarpTileK, - UniversalGemmProblem::TransposeC>>; + UniversalGemmProblem::TransposeC, + memory_operation>>; """ HOT_LOOP_FALSE = """ if(tail_num == ck_tile::TailNumber::Full) { - Run(ck_tile::bool_constant{}, + RunSplitk(ck_tile::bool_constant{}, 
ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Odd) { - Run(ck_tile::bool_constant{}, + RunSplitk(ck_tile::bool_constant{}, ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Even) { - Run(ck_tile::bool_constant{}, + RunSplitk(ck_tile::bool_constant{}, ck_tile::integral_constant{}); } else @@ -79,68 +82,43 @@ HOT_LOOP_FALSE = """ } """ RUN_MEM = """ - if(tail_num == ck_tile::TailNumber::One) - { - Run(ck_tile::bool_constant{}, + // Handle One and Full cases directly + if (tail_num == ck_tile::TailNumber::One) { + RunSplitk(ck_tile::bool_constant{}, ck_tile::integral_constant{}); - } - else if(tail_num == ck_tile::TailNumber::Full) - { - Run(ck_tile::bool_constant{}, + } else if (tail_num == ck_tile::TailNumber::Full) { + RunSplitk(ck_tile::bool_constant{}, ck_tile::integral_constant{}); } + // Variadic call using fold expression + auto check_tail = [&](auto... TNs) { + (try_run< BaseGemmPipeline, decltype(TNs)::value>(tail_num), ...); + }; - if constexpr(BaseGemmPipeline::PrefetchStages > 2) - { - if(tail_num == ck_tile::TailNumber::Two) - { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - - if(tail_num == ck_tile::TailNumber::Three) - { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - if(tail_num == ck_tile::TailNumber::Four) - { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - if(tail_num == ck_tile::TailNumber::Five) - { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - if(tail_num == ck_tile::TailNumber::Six) - { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - if(tail_num == ck_tile::TailNumber::Seven) - { - Run(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - throw std::runtime_error("The tile number is wrong! 
It should not exceed the prefetch stage numbers"); - } + check_tail( + ck_tile::integral_constant{}, + ck_tile::integral_constant{}, + ck_tile::integral_constant{}, + ck_tile::integral_constant{}, + ck_tile::integral_constant{}, + ck_tile::integral_constant{} + ); """ RUN_COMPV3 = """ if(tail_num == ck_tile::TailNumber::Full) { - Run(ck_tile::bool_constant{}, + RunSplitk(ck_tile::bool_constant{}, ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Odd) { - Run(ck_tile::bool_constant{}, + RunSplitk(ck_tile::bool_constant{}, ck_tile::integral_constant{}); } else if(tail_num == ck_tile::TailNumber::Even) { - Run(ck_tile::bool_constant{}, + RunSplitk(ck_tile::bool_constant{}, ck_tile::integral_constant{}); } else @@ -152,12 +130,12 @@ RUN_COMPV3 = """ RUN_COMPV4 = """ if(tail_num == ck_tile::TailNumber::Three) { - Run(ck_tile::bool_constant{}, + RunSplitk(ck_tile::bool_constant{}, ck_tile::integral_constant{}); } else { - Run(ck_tile::bool_constant{}, + RunSplitk(ck_tile::bool_constant{}, ck_tile::integral_constant{}); } """ @@ -347,6 +325,15 @@ namespace {group_name} {{ kPadM: bool, kPadN: bool, kPadK: bool) -> str: """Generate kernel struct template""" return f""" +template +void try_run(ck_tile::TailNumber tn) {{ + if constexpr (Pipeline::PrefetchStages > static_cast(TN)) {{ + if (tn == TN) {{ + RunSplitk(ck_tile::bool_constant{{}}, + ck_tile::integral_constant{{}}); + }} + }} +}} template {{}}); + }} else {{ + Run(has_hot_loop_, + tail_number_, + ck_tile::integral_constant{{}}); + }} + }}; + if(has_hot_loop) {{ {HOT_LOOP_TRUE[pipeline]} }} else {{ @@ -450,6 +452,7 @@ struct GemmKernel {{ return ave_time; }} + static std::string get_name() {{ return std::string("GemmKernel Date: Thu, 8 May 2025 12:59:57 +0800 Subject: [PATCH 103/443] Flatmm merge (#2168) * sync with function interface of cshuffleepiloge,fix flatmm build fail * move code from solin/flatmm which add mfma16*16*32fp8 and optimize flatmm --------- Co-authored-by: solin --- 
example/ck_tile/18_flatmm/CMakeLists.txt | 3 +- example/ck_tile/18_flatmm/flatmm_basic.cpp | 162 ++++++++---- example/ck_tile/18_flatmm/flatmm_basic.hpp | 52 +++- .../ck_tile/18_flatmm/run_flatmm_example.inc | 79 +++--- .../block_flatmm_asmem_bsmem_creg_v1.hpp | 77 +----- .../ops/flatmm/kernel/flatmm_kernel.hpp | 28 +-- .../flatmm_pipeline_agmem_bgmem_creg_v1.hpp | 234 +++++++++++++++++- ...mm_pipeline_agmem_bgmem_creg_v1_policy.hpp | 97 +++++++- include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 8 + .../warp/warp_gemm_attribute_mfma_impl.hpp | 2 +- .../ops/gemm/warp/warp_gemm_dispatcher.hpp | 2 + 11 files changed, 552 insertions(+), 192 deletions(-) diff --git a/example/ck_tile/18_flatmm/CMakeLists.txt b/example/ck_tile/18_flatmm/CMakeLists.txt index 9fbe65e3a7..f4d823e91a 100644 --- a/example/ck_tile/18_flatmm/CMakeLists.txt +++ b/example/ck_tile/18_flatmm/CMakeLists.txt @@ -3,5 +3,6 @@ add_executable(tile_example_flatmm_basic EXCLUDE_FROM_ALL flatmm_basic.cpp) set(EXAMPLE_FLATMM_COMPILE_OPTIONS) # list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) # list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-unused-variable -Wno-unused-parameter) -# list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-unused-local-typedef) +list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -DUSING_MFMA_16x16x32=1 -DENABLE_FP8=1 -Wno-unused-local-typedef) +#list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -DUSING_MFMA_32x32x16=1 -DENABLE_FP8=1 -Wno-unused-local-typedef) target_compile_options(tile_example_flatmm_basic PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) diff --git a/example/ck_tile/18_flatmm/flatmm_basic.cpp b/example/ck_tile/18_flatmm/flatmm_basic.cpp index 05d0c73b7e..5f2c2a5aab 100644 --- a/example/ck_tile/18_flatmm/flatmm_basic.cpp +++ b/example/ck_tile/18_flatmm/flatmm_basic.cpp @@ -12,7 +12,13 @@ #include "ck_tile/host.hpp" #include "flatmm_basic.hpp" -template +template float flatmm_calc(const ck_tile::FlatmmHostArgs& args, const ck_tile::stream_config& s) { // 
The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part. @@ -23,18 +29,32 @@ float flatmm_calc(const ck_tile::FlatmmHostArgs& args, const ck_tile::stream_con constexpr int kBlockPerCu = 2; // This part comes from the Codegen +#if defined(USING_MFMA_16x16x32) || defined(ENABLE_FP16) constexpr ck_tile::index_t M_Tile = 128; constexpr ck_tile::index_t N_Tile = 128; - constexpr ck_tile::index_t K_Tile = 64; + constexpr ck_tile::index_t K_Tile = 128; constexpr ck_tile::index_t M_Warp = 1; constexpr ck_tile::index_t N_Warp = 4; constexpr ck_tile::index_t K_Warp = 1; - constexpr ck_tile::index_t M_Warp_Tile = 32; - constexpr ck_tile::index_t N_Warp_Tile = 32; - constexpr ck_tile::index_t K_Warp_Tile = 16; + constexpr ck_tile::index_t M_Warp_Tile = is_8bit_type::value ? 16 : 32; + constexpr ck_tile::index_t N_Warp_Tile = is_8bit_type::value ? 16 : 32; + constexpr ck_tile::index_t K_Warp_Tile = is_8bit_type::value ? 64 : 16; +#elif defined(USING_MFMA_32x32x16) && defined(ENABLE_FP8) + constexpr ck_tile::index_t M_Tile = 128; + constexpr ck_tile::index_t N_Tile = 256; + constexpr ck_tile::index_t K_Tile = 128; + + constexpr ck_tile::index_t M_Warp = 1; + constexpr ck_tile::index_t N_Warp = 8; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = is_8bit_type::value ? 32 : 32; + constexpr ck_tile::index_t N_Warp_Tile = is_8bit_type::value ? 32 : 32; + constexpr ck_tile::index_t K_Warp_Tile = is_8bit_type::value ? 
32 : 16; +#endif using CodegenFlatmmShape = ck_tile::TileFlatmmShape, ck_tile::sequence, @@ -49,54 +69,112 @@ float flatmm_calc(const ck_tile::FlatmmHostArgs& args, const ck_tile::stream_con AccDataType, CodegenFlatmmShape, CodegenGemmTraits>; - using GemmEpilogue = ck_tile::CShuffleEpilogue< - ck_tile::CShuffleEpilogueProblem>; + const auto Run = [&](const auto memory_operation_) { + constexpr auto memory_operation = memory_operation_.value; - using CodegenFlatmmPolicy = ck_tile::UniversalFlatmmPipelineAgBgCrPolicy; - using CodegenFlatmmPipeline = - ck_tile::FlatmmPipelineAGmemBGmemCRegV1; + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; - // ToDo: Will add the codegen part to test different pipeline policies in GEMM. - // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. - using Kernel = ck_tile::FlatmmKernel; + using CodegenFlatmmPolicy = ck_tile::UniversalFlatmmPipelineAgBgCrPolicy; + using CodegenFlatmmPipeline = + ck_tile::FlatmmPipelineAGmemBGmemCRegV1; - auto kargs = Kernel::MakeKernelArgs(args); + // ToDo: Will add the codegen part to test different pipeline policies in GEMM. + // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. + using Kernel = ck_tile::FlatmmKernel; - const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); - constexpr dim3 blocks = Kernel::BlockSize(); + auto kargs = Kernel::MakeKernelArgs(args); - if(!Kernel::IsSupportedArgument(kargs)) + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); + constexpr dim3 blocks = Kernel::BlockSize(); + + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! 
Skipping gemm!\n"); + } + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return ave_time; + }; + if(args.k_batch == 1) { - throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n"); + return Run(ck_tile::integral_constant{}); } - - if(s.log_level_ > 0) + else { - std::cout << "Launching kernel with args:" - << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" - << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" - << std::endl; + return Run(ck_tile::integral_constant{}); } - - float ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - - return ave_time; } #include "run_flatmm_example.inc" +int run_flatmm_example(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + using Row = ck_tile::tensor_layout::gemm::RowMajor; + using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + + std::string data_type = arg_parser.get_str("prec"); + std::string a_layout = arg_parser.get_str("a_layout"); + std::string b_layout = arg_parser.get_str("b_layout"); + + if(a_layout == "R" && b_layout == "C") + { + if(data_type == "fp16") + { + run_flatmm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); + } + else if(data_type == "bf16") + { + run_flatmm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); + } + else if(data_type == "fp8") + { + run_flatmm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); + } + else if(data_type == "bf8") + { + run_flatmm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); + } + else + { + throw std::runtime_error("Unsupported data_type!"); + } + } + else + { + throw 
std::runtime_error("Unsupported data layout configuration for A,B and C tensors!"); + } + return -1; +} + int main(int argc, char* argv[]) { return !run_flatmm_example(argc, argv); } diff --git a/example/ck_tile/18_flatmm/flatmm_basic.hpp b/example/ck_tile/18_flatmm/flatmm_basic.hpp index 355ac45ebe..bbce978724 100644 --- a/example/ck_tile/18_flatmm/flatmm_basic.hpp +++ b/example/ck_tile/18_flatmm/flatmm_basic.hpp @@ -31,7 +31,7 @@ #error "unsupported CK_TILE_PIPELINE_DEFAULT value" #endif -template +template struct GemmBasicTypeConfig; template <> @@ -44,9 +44,47 @@ struct GemmBasicTypeConfig // ToDo: Add more bias config to support different categories of GEMM. }; +template <> +struct GemmBasicTypeConfig +{ + using ADataType = ck_tile::bf16_t; + using BDataType = ck_tile::bf16_t; + using AccDataType = float; + using CDataType = ck_tile::bf16_t; +}; +template <> +struct GemmBasicTypeConfig +{ + using ADataType = ck_tile::fp8_t; + using BDataType = ck_tile::fp8_t; + using AccDataType = float; + using CDataType = ck_tile::half_t; + // ToDo: Add more bias config to support different categories of GEMM. 
+}; + +template <> +struct GemmBasicTypeConfig +{ + using ADataType = ck_tile::bf8_t; + using BDataType = ck_tile::bf8_t; + using AccDataType = float; + using CDataType = ck_tile::half_t; +}; + template struct DataTypeTraits; +template <> +struct DataTypeTraits +{ + static constexpr const char* name = "fp8"; +}; + +template <> +struct DataTypeTraits +{ + static constexpr const char* name = "bf8"; +}; template <> struct DataTypeTraits { @@ -65,13 +103,11 @@ struct DataTypeTraits static constexpr const char* name = "fp16"; }; -using Types = GemmBasicTypeConfig; - -// Specific type aliases for easy access -using ADataType = Types::ADataType; -using BDataType = Types::BDataType; -using AccDataType = Types::AccDataType; -using CDataType = Types::CDataType; +template +struct is_8bit_type + : std::bool_constant || std::is_same_v> +{ +}; auto create_args(int argc, char* argv[]) { diff --git a/example/ck_tile/18_flatmm/run_flatmm_example.inc b/example/ck_tile/18_flatmm/run_flatmm_example.inc index 864d888074..15a9df2c0c 100644 --- a/example/ck_tile/18_flatmm/run_flatmm_example.inc +++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc @@ -1,6 +1,20 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once +#include + +template +constexpr const char* DataTypeToString() { + if constexpr (std::is_same_v) { + return "fp16"; + } else if constexpr (std::is_same_v) { + return "fp8"; + } else if constexpr (std::is_same_v) { + return "bf8"; + } else { + return "unknown"; + } +} template static constexpr inline auto is_row_major(Layout layout_) @@ -11,7 +25,7 @@ static constexpr inline auto is_row_major(Layout layout_) // mfma_type, 0:32x32, 1:16x16 template -auto shuffle_b(const ck_tile::HostTensor& t, std::string mfma_dtype, int mfma_type = 0) +auto shuffle_b(const ck_tile::HostTensor& t, std::string mfma_dtype, int mfma_type) { assert(t.get_lengths().size() == 2); int n_ = t.get_lengths()[1]; @@ -29,13 +43,13 @@ auto shuffle_b(const ck_tile::HostTensor& t, std::string mfma_dtype, int mfma std::copy(t.begin(), t.end(), t_view.begin()); return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4}); } - else if((mfma_dtype == "int8" || mfma_dtype == "fp8") && mfma_type == 0) + else if((mfma_dtype == "int8" || mfma_dtype == "fp8" || mfma_dtype == "bf8") && mfma_type == 0) { ck_tile::HostTensor t_view({n_ / 32, 32, k_ / 32, 2, 16}); std::copy(t.begin(), t.end(), t_view.begin()); return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4}); } - else if((mfma_dtype == "int8" || mfma_dtype == "fp8") && mfma_type == 1) + else if((mfma_dtype == "int8" || mfma_dtype == "fp8" || mfma_dtype == "bf8") && mfma_type == 1) { ck_tile::HostTensor t_view({n_ / 16, 16, k_ / 64, 4, 16}); std::copy(t.begin(), t.end(), t_view.begin()); @@ -44,6 +58,7 @@ auto shuffle_b(const ck_tile::HostTensor& t, std::string mfma_dtype, int mfma return t; } +template auto calculate_rtol_atol(const ck_tile::index_t K, const ck_tile::index_t kbatch, const float max_accumulated_value) @@ -64,7 +79,13 @@ auto calculate_rtol_atol(const ck_tile::index_t K, return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); } -template +template float invoke_flatmm(ck_tile::DeviceMem& 
a_dev_buf, ck_tile::DeviceMem& b_shuffle_dev_buf, ck_tile::DeviceMem& c_dev_buf, @@ -91,7 +112,7 @@ float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf, args.stride_B = stride_B; args.stride_C = stride_C; - float ave_time = flatmm_calc( + float ave_time = flatmm_calc( args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); std::size_t flop = std::size_t(2) * M * N * K; @@ -100,7 +121,7 @@ float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf, float tflops = static_cast(flop) / 1.E9 / ave_time; float gb_per_sec = num_byte / 1.E6 / ave_time; - std::cout << "Run Flatmm kernel with M =" << M << " N =" << N << " K =" << K + std::cout << "Run Flatmm kernel with DataType = " << DataTypeToString() << " M =" << M << " N =" << N << " K =" << K << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C << " : " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl; @@ -108,7 +129,10 @@ float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf, return ave_time; } -template +template int run_flatmm_example_with_layouts(int argc, char* argv[], const ALayout a_layout = ALayout{}, @@ -119,6 +143,11 @@ int run_flatmm_example_with_layouts(int argc, if(!result) return -1; + using ADataType = typename GemmBasicTypeConfig::ADataType; + using BDataType = typename GemmBasicTypeConfig::BDataType; + using CDataType = typename GemmBasicTypeConfig::CDataType; + using AccDataType = typename GemmBasicTypeConfig::AccDataType; + ck_tile::index_t M = arg_parser.get_int("m"); ck_tile::index_t N = arg_parser.get_int("n"); ck_tile::index_t K = arg_parser.get_int("k"); @@ -154,11 +183,17 @@ int run_flatmm_example_with_layouts(int argc, // do pre-shuffle std::string mfma = arg_parser.get_str("prec"); - ck_tile::HostTensor b_shuffle_host = shuffle_b(b_origin_host, mfma, 0); +#if defined(USING_MFMA_16x16x32) && defined(ENABLE_FP8) + ck_tile::index_t mfma_type = 1; +#else + ck_tile::index_t mfma_type = 0; +#endif + ck_tile::HostTensor 
b_shuffle_host = shuffle_b(b_origin_host, mfma, mfma_type); ck_tile::DeviceMem b_shuffle_dev_buf(b_shuffle_host.get_element_space_size_in_bytes()); b_shuffle_dev_buf.ToDevice(b_shuffle_host.data()); - invoke_flatmm(a_dev_buf, + invoke_flatmm( + a_dev_buf, b_shuffle_dev_buf, c_dev_buf, M, @@ -184,7 +219,7 @@ int run_flatmm_example_with_layouts(int argc, a_host, b_origin_host, c_ref_host); const float max_accumulated_value = *std::max_element(c_ref_host.mData.begin(), c_ref_host.mData.end()); - const auto rtol_atol = calculate_rtol_atol(K, kbatch, max_accumulated_value); + const auto rtol_atol = calculate_rtol_atol(K, kbatch, max_accumulated_value); pass = ck_tile::check_err(c_rslt_host, c_ref_host, "Error: Incorrect results!", @@ -242,7 +277,7 @@ int run_flatmm_example_with_layouts(int argc, c_gpu_ref_dev_buf.FromDevice(c_gpu_ref_host.data()); const float max_accumulated_value = *std::max_element(c_gpu_ref_host.mData.begin(), c_gpu_ref_host.mData.end()); - const auto rtol_atol = calculate_rtol_atol(K, kbatch, max_accumulated_value); + const auto rtol_atol = calculate_rtol_atol(K, kbatch, max_accumulated_value); pass = ck_tile::check_err(c_rslt_host, c_gpu_ref_host, "Error: Incorrect results!", @@ -257,25 +292,3 @@ int run_flatmm_example_with_layouts(int argc, return pass; } - -int run_flatmm_example(int argc, char* argv[]) -{ - auto [result, arg_parser] = create_args(argc, argv); - if(!result) - return -1; - - using Row = ck_tile::tensor_layout::gemm::RowMajor; - using Col = ck_tile::tensor_layout::gemm::ColumnMajor; - - std::string a_layout = arg_parser.get_str("a_layout"); - std::string b_layout = arg_parser.get_str("b_layout"); - - if(a_layout == "R" && b_layout == "C") - { - return run_flatmm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); - } - else - { - throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!"); - } -} diff --git a/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1.hpp 
b/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1.hpp index 935eb2c028..18b2fe6483 100644 --- a/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1.hpp +++ b/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1.hpp @@ -66,76 +66,24 @@ struct BlockFlatmmASmemBSmemCRegV1 } // C += A * B - template + template CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor, - const ABlockWindow& a_block_window, - const BFlatBlockWindow& b_flat_block_window) const + ABlockWindow& a_warp_windows, + BFlatBlockTensor& b_warp_tensor) const { - static_assert(std::is_same_v && - std::is_same_v && - std::is_same_v, - "wrong!"); - constexpr index_t MPerBlock = ABlockWindow{}.get_window_lengths()[number<0>{}]; - constexpr index_t KPerBlock = ABlockWindow{}.get_window_lengths()[number<1>{}]; - - static_assert(MPerBlock == BlockGemmShape::kM && KPerBlock == BlockGemmShape::kK, "wrong!"); + constexpr index_t MPerBlock = BlockGemmShape::kM; + constexpr index_t KPerBlock = BlockGemmShape::kK; constexpr auto config = BlockPolicy::template GetWarpGemmMWarpNWarp(); using WG = remove_cvref_t())>; constexpr index_t MWarp = config.template at<1>(); - constexpr index_t NWarp = config.template at<2>(); constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM); constexpr index_t NIterPerWarp = BlockTile::at(idxN) / (WarpTile::at(idxN) * BlockWarps::at(idxN)); constexpr index_t KIterPerWarp = KPerBlock / WG::kK; - constexpr index_t MPerBlockPerIter = MPerBlock / MIterPerWarp; - constexpr index_t KPerBlockPerIter = KPerBlock / KIterPerWarp; - - constexpr index_t NFlatPerBlockPerIter = BlockGemmShape::flatNPerWarp; - constexpr index_t KFlatPerBlockPerIter = BlockGemmShape::flatKPerWarp; - - const index_t iMWarp = get_warp_id() / NWarp; - - // construct A-warp-window - auto a_warp_window_tmp = make_tile_window( - a_block_window.get_bottom_tensor_view(), - make_tuple(number{}, number{}), - a_block_window.get_window_origin() + multi_index<2>{iMWarp * 
WG::kM, 0}, - make_static_tile_distribution(typename WG::AWarpDstrEncoding{})); - statically_indexed_array< - statically_indexed_array, - MIterPerWarp> - a_warp_windows; - static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { - static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { - a_warp_windows(mIter)(kIter) = a_warp_window_tmp; - - move_tile_window(a_warp_windows(mIter)(kIter), - {mIter * MPerBlockPerIter, kIter * KPerBlockPerIter}); - }); - }); - - // construct Bflat-warp-window - auto b_flat_warp_windows_tmp = b_flat_block_window; - statically_indexed_array< - statically_indexed_array, - NIterPerWarp> - b_flat_warp_windows; - static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { - static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { - b_flat_warp_windows(nIter)(kIter) = b_flat_warp_windows_tmp; - - move_tile_window(b_flat_warp_windows(nIter)(kIter), - {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter}); - }); - }); - - // auto b_warp_windows = b_origin_warp_windows; - auto b_warp_windows = b_flat_warp_windows; - using CWarpDstr = typename WG::CWarpDstr; using CWarpTensor = typename WG::CWarpTensor; @@ -150,9 +98,6 @@ struct BlockFlatmmASmemBSmemCRegV1 const auto a_warp_tensor = load_tile(a_warp_windows(mIter)(kIter)); static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { - // read B warp tensor from B Block window - const auto b_warp_tensor = load_tile(b_warp_windows(nIter)(kIter)); - // read C warp tensor from C block tensor CWarpTensor c_warp_tensor; @@ -161,7 +106,7 @@ struct BlockFlatmmASmemBSmemCRegV1 merge_sequences(sequence<1, 1>{}, c_warp_y_lengths)); // warp GEMM - WG{}(c_warp_tensor, a_warp_tensor, b_warp_tensor); + WG{}(c_warp_tensor, a_warp_tensor, b_warp_tensor(nIter)(kIter)); // write C warp tensor into C block tensor c_block_tensor.set_y_sliced_thread_data( @@ -172,16 +117,6 @@ struct BlockFlatmmASmemBSmemCRegV1 }); }); } - - // C = A * B - template - CK_TILE_DEVICE auto operator()(const ABlockTensorTmp& a_block_tensor_tmp, - const 
BFlatBlockWindow& b_flat_block_window) const - { - auto c_block_tensor = MakeCBlockTile(); - operator()(c_block_tensor, a_block_tensor_tmp, b_flat_block_window); - return c_block_tensor; - } }; } // namespace ck_tile diff --git a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp index eb45e6c0bd..a9ed1519e6 100644 --- a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp +++ b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp @@ -321,7 +321,7 @@ struct FlatmmKernel const auto& c_tensor_view = [&]() { if constexpr(std::is_same_v) { - return make_naive_tensor_view( + return make_naive_tensor_view( c_ptr, make_tuple(kargs.M, kargs.N), make_tuple(kargs.stride_C, 1), @@ -330,7 +330,7 @@ struct FlatmmKernel } else { - return make_naive_tensor_view( + return make_naive_tensor_view( c_ptr, make_tuple(kargs.M, kargs.N), make_tuple(1, kargs.stride_C), @@ -426,7 +426,6 @@ struct FlatmmKernel return make_tuple(a_block_window, b_flat_block_window, c_block_window); } - template CK_TILE_DEVICE static void RunFlatmm(const ADataType* a_ptr, const BDataType* b_flat_ptr, CDataType* c_ptr, @@ -438,7 +437,8 @@ struct FlatmmKernel { // Create Gemm tensor views, pad views and tile windows const auto& gemm_tensor_views_tuple = - MakeGemmTensorViews(a_ptr, b_flat_ptr, c_ptr, kargs, splitk_batch_offset); + MakeGemmTensorViews( + a_ptr, b_flat_ptr, c_ptr, kargs, splitk_batch_offset); const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); @@ -453,9 +453,8 @@ struct FlatmmKernel // Run Epilogue Pipeline auto& c_block_window = gemm_tile_windows.at(I2); - EpiloguePipeline{} - .template operator()( - c_block_window, c_block_tile, smem_ptr); + EpiloguePipeline{}.template operator()( + c_block_window, c_block_tile, smem_ptr); } CK_TILE_DEVICE void operator()(FlatmmKernelArgs kargs) const @@ -475,21 +474,12 @@ struct FlatmmKernel // 
allocate LDS __shared__ char smem_ptr[GetSmemSize()]; - if(kargs.k_batch == 1) + if constexpr(!(EpiloguePipeline::MemoryOperation == memory_operation_enum::atomic_add && + EpiloguePipeline::GetVectorSizeC() % 2 != 0 && + is_any_of::value)) { RunFlatmm(a_ptr, b_flat_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n); } - else - { - // Do not compile in case where we have unsupported - // VectorSizeC & data type configuration. - if constexpr(!(EpiloguePipeline::GetVectorSizeC() % 2 != 0 && - is_any_of::value)) - { - RunFlatmm( - a_ptr, b_flat_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n); - } - } } }; diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp index 611aff318f..2ff9d1ebf0 100644 --- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp @@ -73,6 +73,83 @@ struct FlatmmPipelineAGmemBGmemCRegV1 return PipelinePolicy::template GetSmemSize(); } + CK_TILE_HOST_DEVICE static constexpr auto HotLoopScheduler() + { + constexpr auto config = BlockFlatmm::BlockPolicy::template GetWarpGemmMWarpNWarp(); + + using WG = remove_cvref_t())>; + + constexpr index_t MWarp = config.template at<1>(); + constexpr index_t NWarp = config.template at<2>(); + + constexpr index_t KIterPerWarp = kKPerBlock / WG::kK; + constexpr index_t MIterPerWarp = kMPerBlock / (MWarp * WG::kM); + constexpr index_t NIterPerWarp = kNPerBlock / (NWarp * WG::kN); + + constexpr index_t KPerLoad = Problem::VectorLoadSize / sizeof(ADataType); + constexpr index_t A_Buffer_Load_Inst_Num = kMPerBlock * kKPerBlock / BlockSize / KPerLoad; + constexpr index_t A_LDS_Read_Inst_Num = MIterPerWarp * KIterPerWarp; + constexpr index_t B_Buffer_Load_Inst_Num = NIterPerWarp * KIterPerWarp; + // constexpr index_t A_LDS_Read_Inst_Remain = A_LDS_Read_Inst_Num - A_Buffer_Load_Inst_Num; 
+#if defined(USING_MFMA_16x16x32) && defined(ENABLE_FP8) + static_for<0, A_Buffer_Load_Inst_Num, 1>{}([&](auto i) { + ignore = i; + __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + static_for<0, A_LDS_Read_Inst_Num - A_Buffer_Load_Inst_Num, 1>{}([&](auto i) { + ignore = i; + __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read + __builtin_amdgcn_sched_group_barrier(0x008, 3, 0); // MFMA + }); + static_for<0, B_Buffer_Load_Inst_Num, 1>{}([&](auto i) { + ignore = i; + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA + }); + static_for<0, A_Buffer_Load_Inst_Num, 1>{}([&](auto i) { + ignore = i; + __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + __builtin_amdgcn_sched_group_barrier(0x008, 4, 0); // MFMA + }); + +#elif defined(USING_MFMA_32x32x16) + static_for<0, + A_LDS_Read_Inst_Num / 2 - A_Buffer_Load_Inst_Num - B_Buffer_Load_Inst_Num, + 1>{}([&](auto i) { + ignore = i; + __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + static_for<0, A_Buffer_Load_Inst_Num, 1>{}([&](auto i) { + ignore = i; + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + static_for<0, A_LDS_Read_Inst_Num / 2, 1>{}([&](auto i) { + ignore = i; + __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + static_for<0, B_Buffer_Load_Inst_Num, 1>{}([&](auto i) { + ignore = i; + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + 
__builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + }); + static_for<0, A_Buffer_Load_Inst_Num, 1>{}([&](auto i) { + ignore = i; + __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + __builtin_amdgcn_sched_group_barrier(0x008, 3, 0); // MFMA + }); + __builtin_amdgcn_sched_group_barrier(0x008, 4, 0); // MFMA +#endif + } + template CK_TILE_HOST_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp, const AElementFunction& a_element_func, @@ -89,6 +166,25 @@ struct FlatmmPipelineAGmemBGmemCRegV1 static_assert(kKPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}], "wrong!"); + constexpr auto config = BlockFlatmm::BlockPolicy::template GetWarpGemmMWarpNWarp(); + + using WG = remove_cvref_t())>; + + constexpr index_t MWarp = config.template at<1>(); + constexpr index_t NWarp = config.template at<2>(); + + constexpr index_t MIterPerWarp = kMPerBlock / (MWarp * WG::kM); + constexpr index_t NIterPerWarp = kNPerBlock / (NWarp * WG::kN); + constexpr index_t KIterPerWarp = kKPerBlock / WG::kK; + + constexpr index_t KFlatPerBlockPerIter = flatKPerWarp; + constexpr index_t NFlatPerBlockPerIter = flatNPerWarp; + + constexpr index_t MPerBlockPerIter = kMPerBlock / MIterPerWarp; + constexpr index_t KPerBlockPerIter = kKPerBlock / KIterPerWarp; + + const index_t iMWarp = get_warp_id() / NWarp; + // A tile in LDS ADataType* p_a_lds = static_cast(p_smem); @@ -112,6 +208,25 @@ struct FlatmmPipelineAGmemBGmemCRegV1 auto a_lds_gemm_window = make_tile_window( a_lds_block, make_tuple(number{}, number{}), {0, 0}); + auto a_warp_window_tmp = make_tile_window( + a_lds_gemm_window.get_bottom_tensor_view(), + make_tuple(number{}, number{}), + a_lds_gemm_window.get_window_origin() + multi_index<2>{iMWarp * WG::kM, 0}, + make_static_tile_distribution(typename WG::AWarpDstrEncoding{})); + + statically_indexed_array< + statically_indexed_array, + MIterPerWarp> + a_warp_windows; 
+ static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { + static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { + a_warp_windows(mIter)(kIter) = a_warp_window_tmp; + + move_tile_window(a_warp_windows(mIter)(kIter), + {mIter * MPerBlockPerIter, kIter * KPerBlockPerIter}); + }); + }); + // Block GEMM auto block_flatmm = BlockFlatmm(); @@ -126,16 +241,45 @@ struct FlatmmPipelineAGmemBGmemCRegV1 b_flat_distribution); // Acc register tile - auto c_block_tile = decltype(block_flatmm(a_lds_gemm_window, b_flat_dram_window)){}; + auto c_block_tile = block_flatmm.MakeCBlockTile(); // prefetch // global read 0 auto a_block_tile = load_tile(a_copy_dram_window); + statically_indexed_array< + statically_indexed_array, + NIterPerWarp> + b_flat_dram_windows; + + statically_indexed_array< + statically_indexed_array, + NIterPerWarp> + b_warp_tensor; + + statically_indexed_array< + statically_indexed_array, + NIterPerWarp> + b_warp_tensor_2; + + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { + b_flat_dram_windows(nIter)(kIter) = b_flat_dram_window; + + move_tile_window(b_flat_dram_windows(nIter)(kIter), + {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter}); + + b_warp_tensor(nIter)(kIter) = load_tile(b_flat_dram_windows(nIter)(kIter)); + }); + }); + { // move to 1 move_tile_window(a_copy_dram_window, {0, kKPerBlock}); + // move to next flat K + move_tile_window(b_flat_dram_window, {0, BlockGemmShape::flatKPerBlock}); + // initialize C tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); @@ -152,40 +296,116 @@ struct FlatmmPipelineAGmemBGmemCRegV1 { store_tile(a_copy_lds_window, tile_elementwise_in(a_element_func, a_block_tile)); } + block_sync_lds(); } - index_t iCounter = num_loop - 1; + index_t iCounter = num_loop / 2 - 1; while(iCounter > 0) { // global read i + 1 a_block_tile = load_tile(a_copy_dram_window); - block_sync_lds(); - // GEMM i - block_flatmm(c_block_tile, a_lds_gemm_window, b_flat_dram_window); + 
block_flatmm(c_block_tile, a_warp_windows, b_warp_tensor); block_sync_lds(); + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { + b_flat_dram_windows(nIter)(kIter) = b_flat_dram_window; + + move_tile_window(b_flat_dram_windows(nIter)(kIter), + {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter}); + + b_warp_tensor_2(nIter)(kIter) = load_tile(b_flat_dram_windows(nIter)(kIter)); + }); + }); + // move to i + 2 move_tile_window(a_copy_dram_window, {0, kKPerBlock}); + // move to next flat K + move_tile_window(b_flat_dram_window, {0, BlockGemmShape::flatKPerBlock}); + // LDS write i + 1 - const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile); + auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile); store_tile(a_copy_lds_window, a_block_tile_tmp); + HotLoopScheduler(); + block_sync_lds(); + + // iCounter--; + + // global read i + 1 + a_block_tile = load_tile(a_copy_dram_window); + + // GEMM i + block_flatmm(c_block_tile, a_warp_windows, b_warp_tensor_2); + + block_sync_lds(); + + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { + b_flat_dram_windows(nIter)(kIter) = b_flat_dram_window; + + move_tile_window(b_flat_dram_windows(nIter)(kIter), + {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter}); + + b_warp_tensor(nIter)(kIter) = load_tile(b_flat_dram_windows(nIter)(kIter)); + }); + }); + + // move to i + 2 + move_tile_window(a_copy_dram_window, {0, kKPerBlock}); // move to next flat K move_tile_window(b_flat_dram_window, {0, BlockGemmShape::flatKPerBlock}); + // LDS write i + 1 + a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile); + store_tile(a_copy_lds_window, a_block_tile_tmp); + + HotLoopScheduler(); + block_sync_lds(); + iCounter--; } // tail { + // global read i + 1 + a_block_tile = load_tile(a_copy_dram_window); + + // GEMM i + block_flatmm(c_block_tile, a_warp_windows, 
b_warp_tensor); + + block_sync_lds(); + + static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { + static_for<0, KIterPerWarp, 1>{}([&](auto kIter) { + b_flat_dram_windows(nIter)(kIter) = b_flat_dram_window; + + move_tile_window(b_flat_dram_windows(nIter)(kIter), + {nIter * NFlatPerBlockPerIter, kIter * KFlatPerBlockPerIter}); + + b_warp_tensor_2(nIter)(kIter) = load_tile(b_flat_dram_windows(nIter)(kIter)); + }); + }); + + // move to i + 2 + // move_tile_window(a_copy_dram_window, {0, kKPerBlock}); + + // LDS write i + 1 + const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile); + store_tile(a_copy_lds_window, a_block_tile_tmp); + + // move to next flat K + // move_tile_window(b_flat_dram_window, {0, BlockGemmShape::flatKPerBlock}); + + HotLoopScheduler(); block_sync_lds(); // GEMM num_loop - 1 - block_flatmm(c_block_tile, a_lds_gemm_window, b_flat_dram_window); + block_flatmm(c_block_tile, a_warp_windows, b_warp_tensor_2); } return c_block_tile; diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp index d1aac07d54..474924ec84 100644 --- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp @@ -19,23 +19,100 @@ struct UniversalFlatmmPipelineAgBgCrPolicy CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor() { using namespace ck_tile; - - constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM; - constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK; +#if defined(USING_MFMA_16x16x32) && defined(ENABLE_FP8) + /*reduce transform layers,compare with old ck*/ + constexpr index_t MPerBlock = Problem::BlockGemmShape::kM; + constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; + constexpr index_t KPack = GetSmemPackA(); constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor( 
- make_tuple(number{}, number{}, number<8>{}), - make_tuple(number<(kMPerBlock + 1) * 8>{}, number<8>{}, number<1>{}), - number<8>{}, + make_tuple(number{}, number{}, number{}), + make_tuple(number{}, number{}, number<1>{}), + number{}, + number<1>{}); + + constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor( + a_lds_block_desc_0, + make_tuple( + make_xor_transform(make_tuple(number{}, number{})), + make_pass_through_transform(number{})), + make_tuple(sequence<1, 0>{}, sequence<2>{}), + make_tuple(sequence<1, 0>{}, sequence<2>{})); + + constexpr auto a_lds_block_desc = transform_tensor_descriptor( + a_lds_block_desc_permuted, + make_tuple(make_pass_through_transform(number{}), + make_merge_transform_v3_division_mod( + make_tuple(number{}, number{}))), + make_tuple(sequence<1>{}, sequence<0, 2>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + return a_lds_block_desc; +#elif defined(USING_MFMA_32x32x16) + constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM; + constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK; + constexpr index_t kKPack = GetSmemPackA(); + + constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, number{}, number{}), + make_tuple(number<(kMPerBlock + 1) * kKPack>{}, number{}, number<1>{}), + number{}, number<1>{}); constexpr auto a_lds_block_desc = transform_tensor_descriptor( a_lds_block_desc_0, make_tuple(make_pass_through_transform(kMPerBlock), - make_merge_transform(make_tuple(kKPerBlock / 8, 8))), + make_merge_transform(make_tuple(kKPerBlock / kKPack, kKPack))), make_tuple(sequence<1>{}, sequence<0, 2>{}), make_tuple(sequence<0>{}, sequence<1>{})); + return a_lds_block_desc; +#endif +/*xor*/ +#if 0 + constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM; + constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK; + constexpr index_t kKPack = GetSmemPackA(); + using ADataType = remove_cvref_t; + + constexpr auto DataTypeSize = sizeof(ADataType); + constexpr auto 
MLdsLayer = + (32 * 4 / kKPerBlock / DataTypeSize) < 1 ? 1 : (32 * 4 / kKPerBlock / DataTypeSize); + + constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, + number{}, + number{}), + make_tuple(number{}, number{}, number<1>{}), + number{}, + number<1>{}); + + constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor( + a_lds_block_desc_0, + make_tuple(make_xor_transform(make_tuple(number{}, + number{})), + make_pass_through_transform(number{})), + make_tuple(sequence<1, 0>{}, sequence<2>{}), + make_tuple(sequence<1, 0>{}, sequence<2>{})); + + constexpr auto a_lds_block_desc_xk0_mnldslayer_mn_xk1 = transform_tensor_descriptor( + a_lds_block_desc_permuted, + make_tuple(make_unmerge_transform( + make_tuple(number{}, number{})), + make_pass_through_transform(number{}), + make_pass_through_transform(number{})), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}), + make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{})); + + constexpr auto a_lds_block_desc = transform_tensor_descriptor( + a_lds_block_desc_xk0_mnldslayer_mn_xk1, + make_tuple(make_merge_transform( + make_tuple(number{}, number{})), + make_merge_transform( + make_tuple(number{}, number{}))), + make_tuple(sequence<1, 0>{}, sequence<2, 3>{}), + make_tuple(sequence<0>{}, sequence<1>{})); +#endif return a_lds_block_desc; } @@ -58,7 +135,7 @@ struct UniversalFlatmmPipelineAgBgCrPolicy template CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackA() { - return Problem::VectorLoadSize; + return Problem::VectorLoadSize / sizeof(typename Problem::ADataType); } template @@ -82,7 +159,7 @@ struct UniversalFlatmmPipelineAgBgCrPolicy constexpr index_t KPack = GetSmemPackA(); static_assert(KPack % K3 == 0); constexpr index_t K2 = KPack / K3; - if constexpr(get_warp_size() % (K2 * M0)) + if constexpr(get_warp_size() >= (K2 * M0)) { constexpr index_t K1 = get_warp_size() / (K2 * M0); constexpr index_t K0 = BlockSize / get_warp_size(); @@ -209,7 +286,7 @@ struct 
UniversalFlatmmPipelineAgBgCrPolicy static_assert(kKPack % K3 == 0); constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave constexpr index_t warp_size = get_warp_size(); - if constexpr(warp_size % (K2 * M0) == 0) + if constexpr(warp_size >= (K2 * M0)) { constexpr index_t K1 = warp_size / (K2 * M0); constexpr index_t K0 = kBlockSize / warp_size; diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp index e75aca1d91..c98d46e3a0 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp @@ -193,6 +193,14 @@ using WarpGemmMfmaBf16Bf16F32M64N4K16 = WarpGemmImpl>>; +using WarpGemmMfma_f32_32x32x32_fp8_fp8 = WarpGemmImpl, + 2>>; + +using WarpGemmMfma_f32_32x32x32_bf8_bf8 = WarpGemmImpl, + 2>>; + using WarpGemmMfma_f32_32x32x16_fp8_bf8 = WarpGemmImpl< WarpGemmAtrributeMfma>>; diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp index 96c3c3d29f..69d22496f1 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp @@ -1022,7 +1022,7 @@ struct WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base } else if constexpr(std::is_same_v && std::is_same_v) { - DISPATCH_MFMA_("mfma_f32_116x16x32_fp8_bf8", "+v", "v", "v", "v") + DISPATCH_MFMA_("mfma_f32_16x16x32_fp8_bf8", "+v", "v", "v", "v") } else if constexpr(std::is_same_v && std::is_same_v) { diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp index 64bd61a3dc..b2f5d56d01 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp @@ -57,6 +57,7 @@ template<> struct WarpGemmMfmaDispatcher struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; }; +template<> struct 
WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x32_fp8_fp8; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_16x16x32_fp8_fp8; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_16x16x64_fp8_fp8; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; }; @@ -65,6 +66,7 @@ template<> struct WarpGemmMfmaDispatcher struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; }; +template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x32_bf8_bf8; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_16x16x32_bf8_bf8; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_16x16x64_bf8_bf8; }; template<> struct WarpGemmMfmaDispatcher { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; }; From a32d9077710d8c99283be86565a1e9f9a5aa1671 Mon Sep 17 00:00:00 2001 From: Khushbu Agarwal Date: Wed, 7 May 2025 23:09:22 -0700 Subject: [PATCH 104/443] Disable the SMFMA instruction for gfx90a. 
(#2174) * remove smfma for gfx90a * clang formatted --- include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 3 ++- tile_engine/ops/gemm/gemm_instance_builder.py | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp index c98d46e3a0..61c61c2d9a 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp @@ -97,12 +97,13 @@ using WarpGemmMfmaF16F16F32M64N4K16 = WarpGemmImpl>; // fp16 2:4 structured sparsity - +#if defined(__gfx94__) || defined(__gfx950__) using WarpGemmSmfmacF16F16F32M32N32K16 = WarpGemmSmfmacImpl>>; using WarpGemmSmfmacF16F16F32M16N16K32 = WarpGemmSmfmacImpl>>; +#endif // bf16 using WarpGemmMfmaBf16Bf16F32M32N32K8 = WarpGemmImpl< diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index 3839523e3d..c00554df8f 100755 --- a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -535,7 +535,11 @@ struct GemmDispatcher { ((tile[6] == 32 and tile[7] == 32 and tile[8] == 16) or (tile[6] == 16 and tile[7] == 16 and tile[8] == 32)) content += f""" - run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(sparse)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream);""" +#if defined(__gfx908__) + run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(False)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream); +#else + run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(sparse)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream); +#endif""" content += f""" }} else {{""" for tile in 
tile_params: From c757046d49e5e5bbd3b3c9bfda95cd093e70f0e8 Mon Sep 17 00:00:00 2001 From: Thomas Ning Date: Thu, 8 May 2025 00:07:03 -0700 Subject: [PATCH 105/443] Revert "Disable the SMFMA instruction for gfx90a. (#2174)" (#2175) This reverts commit a32d9077710d8c99283be86565a1e9f9a5aa1671. --- include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 3 +-- tile_engine/ops/gemm/gemm_instance_builder.py | 6 +----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp index 61c61c2d9a..c98d46e3a0 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp @@ -97,13 +97,12 @@ using WarpGemmMfmaF16F16F32M64N4K16 = WarpGemmImpl>; // fp16 2:4 structured sparsity -#if defined(__gfx94__) || defined(__gfx950__) + using WarpGemmSmfmacF16F16F32M32N32K16 = WarpGemmSmfmacImpl>>; using WarpGemmSmfmacF16F16F32M16N16K32 = WarpGemmSmfmacImpl>>; -#endif // bf16 using WarpGemmMfmaBf16Bf16F32M32N32K8 = WarpGemmImpl< diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index c00554df8f..3839523e3d 100755 --- a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -535,11 +535,7 @@ struct GemmDispatcher { ((tile[6] == 32 and tile[7] == 32 and tile[8] == 16) or (tile[6] == 16 and tile[7] == 16 and tile[8] == 32)) content += f""" -#if defined(__gfx908__) - run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(False)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream); -#else - run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(sparse)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream); -#endif""" + 
run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(sparse)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream);""" content += f""" }} else {{""" for tile in tile_params: From cb27e7c77fe807dbdc763feb128bbd127f49b4c8 Mon Sep 17 00:00:00 2001 From: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com> Date: Thu, 8 May 2025 13:26:03 -0600 Subject: [PATCH 106/443] Ensure MX GEMM Instances can be Cross-Compiled for Multiple Architectures (#2171) * Re-enable MX GEMM instances * Fix compilation error when building MX GEMM for multiple architectures --- .../gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp | 2 +- .../gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp | 4 ++-- library/src/tensor_operation_instance/gpu/CMakeLists.txt | 5 ++++- .../device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn.hpp | 4 +--- .../device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn.hpp | 4 +--- .../device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp | 7 +------ .../device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp | 7 +------ 7 files changed, 11 insertions(+), 22 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp index c37af49387..2c34be9007 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp @@ -714,7 +714,7 @@ struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX using device_gemm_mx_xdl_bf8_f8_f16_mk_kn_mn_instances = std::tuple< -// clang-format off + // clang-format off //#########################| ALayout| BLayout| CLayout|AData|AScale|BData|BScale| CData| AccData| Cshuffle| A| B| C| GEMM| Scale Block| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| 
ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| //#########################| | | | Type| Data| Type| Data| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | Type| | Type| | | | Operation| Operation| Operation| | | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx950__) DeviceGemmMX_Xdl_CShuffleV3< Row, Row, Row, BF8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 128, 64, 16, 128, 16, 4, 16, 16, 2, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Row, Row, BF8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 256, 256, 256, 16, 4, 32, 32, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, false, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, false, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Row, Row, BF8, E8M0, F8, E8M0, F16, F32, F16, 
PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 64, 64, 256, 16, 4, 32, 32, 1, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Row, Row, BF8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 128, 16, 4, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Row, Row, BF8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 128, 16, 32, 512, 16, 8, 16, 16, 1, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<64, 2, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1> -#endif // clang-format on >; diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn.hpp index 5b0c5137b3..d3f74b2907 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn.hpp @@ -39,12 +39,11 @@ static constexpr auto ScaleBlockSize = 32; template using device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn_instances = std::tuple< -// clang-format off + // clang-format off //#########################| ALayout| BLayout| CLayout|AData|AScale|BData|BScale| CData| AccData| Cshuffle| A| B| C| GEMM| Scale Block| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| 
ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| //#########################| | | | Type| Data| Type| Data| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | Type| | Type| | | | Operation| Operation| Operation| | | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx950__) DeviceGemmMX_Xdl_CShuffleV3< Col, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 128, 4, 16, 32, 32, 2, 2, S<32, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Col, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 16, 256, 128, 4, 16, 16, 16, 1, 4, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Col, Col, 
Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, @@ -52,7 +51,6 @@ using device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn_instances = std::tuple< DeviceGemmMX_Xdl_CShuffleV3< Col, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 256, 256, 128, 8, 16, 16, 16, 8, 8, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Col, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 256, 256, 64, 4, 16, 32, 32, 4, 4, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Col, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 128, 128, 128, 128, 4, 16, 16, 16, 4, 8, S<32, 4, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 16, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1> -#endif // clang-format on >; diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp index 8e25bcc25f..ac09df7ea2 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn.hpp @@ -39,21 +39,16 @@ static constexpr auto ScaleBlockSize = 32; template using device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_instances = std::tuple< -// clang-format off + // clang-format off //#########################| ALayout| BLayout| CLayout|AData|AScale|BData|BScale| CData| AccData| Cshuffle| A| B| C| GEMM| Scale Block| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| //#########################| | | | Type| Data| Type| Data| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | Type| | Type| | | | Operation| Operation| Operation| | | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx950__) DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 128, 128, 16, 128, 16, 16, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 
16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 256, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, false, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 64, 16, 16, 512, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1> - -//Require verification - //DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 256, 256, 128, 16, 16, 16, 16, 8, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1> -#endif // 
clang-format on >; diff --git a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp index 5fefb57257..68363de523 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn.hpp @@ -39,21 +39,16 @@ static constexpr auto ScaleBlockSize = 32; template using device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_instances = std::tuple< -// clang-format off + // clang-format off //#########################| ALayout| BLayout| CLayout|AData|AScale|BData|BScale| CData| AccData| Cshuffle| A| B| C| GEMM| Scale Block| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| //#########################| | | | Type| Data| Type| Data| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //#########################| | | | | Type| | Type| | | | Operation| Operation| Operation| | | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| 
_NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if defined(__gfx950__) DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 128, 128, 16, 128, 16, 16, 16, 16, 4, 1, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 256, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, false, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 128, 128, 128, 16, 16, 32, 32, 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 64, 16, 16, 512, 16, 16, 16, 16, 1, 1, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, 
BlockGemmPipelineVersion::v1> - - //Require verification - //DeviceGemmMX_Xdl_CShuffleV3< Row, Col, Row, F8, E8M0, F8, E8M0, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmSpec, ScaleBlockSize, 256, 256, 256, 128, 16, 16, 16, 16, 8, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, -#endif // clang-format on >; From 3448e12609f9c8a623e31e3eadc2617928f2780c Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 8 May 2025 13:29:14 -0700 Subject: [PATCH 107/443] Generate ckProfiler package for gfx942 only. (#2180) * build CI for gfx942 exclusively * run the last stage in a docker with user jenkins * update the image for the last stage * ignore perf_log if not found * archive and store all packages * use ccache for building packages --- Jenkinsfile | 50 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index a9d30d9f71..2ad96ed44b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -116,7 +116,7 @@ def getDockerImage(Map conf=[:]){ def retimage try { - echo "Pulling down image: ${image}" + echo "Pulling image: ${image}" retimage = docker.image("${image}") withDockerRegistry([ credentialsId: "ck_docker_cred", url: "" ]) { retimage.pull() @@ -335,8 +335,8 @@ def cmake_build(Map conf=[:]){ } } - // Only archive from master or develop - if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "amd-master")) { + // Only archive from develop + if (package_build == true && env.BRANCH_NAME == "develop") { archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true } //check the node gpu architecture @@ -539,13 +539,16 @@ def Build_CK(Map conf=[:]){ """ } dir("build"){ - if (params.RUN_FULL_QA && arch_type == 1 ){ + if (params.RUN_FULL_QA && arch_type == 2 ){ // 
build deb packages for all gfx9 targets on gfx90a system and prepare to export echo "Build ckProfiler package" sh 'make -j package' - archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb' - sh 'mv composablekernel-ckprofiler_*.deb ckprofiler_0.2.0_amd64.deb' - stash includes: "ckprofiler_0.2.0_amd64.deb", name: "ckprofiler_0.2.0_amd64.deb" + archiveArtifacts artifacts: 'composablekernel*.deb' + sh 'mv composablekernel-ckprofiler_*.deb composablekernel-ckprofiler_1.1.0_amd64.deb' + sh 'mv composablekernel-dev_*.deb composablekernel-dev_1.1.0_amd64.deb' + sh 'mv composablekernel-examples_*.deb composablekernel-examples_1.1.0_amd64.deb' + sh 'mv composablekernel-tests_*.deb composablekernel-tests_1.1.0_amd64.deb' + stash includes: "composablekernel-**.deb", name: "packages" } } // run performance tests, stash the logs, results will be processed on the master node @@ -654,7 +657,8 @@ def Build_CK_and_Reboot(Map conf=[:]){ def process_results(Map conf=[:]){ env.HSA_ENABLE_SDMA=0 checkout scm - def image = getDockerImageName() + //use older image that has user jenkins + def image = "rocm/composable_kernel:ck_ub22.04_rocm6.3" def prefixpath = "/opt/rocm" // Jenkins is complaining about the render group @@ -667,12 +671,17 @@ def process_results(Map conf=[:]){ def retimage gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') { - try { - (retimage, image) = getDockerImage(conf) + try + { + echo "Pulling image: ${image}" + retimage = docker.image("${image}") + withDockerRegistry([ credentialsId: "ck_docker_cred", url: "" ]) { + retimage.pull() + } } - catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ - echo "The job was cancelled or aborted" - throw e + catch(Exception ex) + { + error "Unable to locate image: ${image}" } } @@ -700,9 +709,14 @@ def process_results(Map conf=[:]){ } if (params.RUN_FULL_QA){ // unstash perf files to master - unstash 
"ckprofiler_0.2.0_amd64.deb" - sh "sshpass -p ${env.ck_deb_pw} scp -o StrictHostKeyChecking=no ckprofiler_0.2.0_amd64.deb ${env.ck_deb_user}@${env.ck_deb_ip}:/var/www/html/composable_kernel/" - unstash "perf_log" + unstash "packages" + sh "sshpass -p ${env.ck_deb_pw} scp -o StrictHostKeyChecking=no composablekernel-*.deb ${env.ck_deb_user}@${env.ck_deb_ip}:/var/www/html/composable_kernel/" + try{ + unstash "perf_log" + } + catch(Exception err){ + echo "could not locate perf_log: ${err.getMessage()}." + } try{ unstash "perf_log_gfx11" unstash "perf_log_gfx12" @@ -1114,11 +1128,11 @@ pipeline { agent{ label rocmnode("gfx942") } environment{ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \ - -DGPU_TARGETS="gfx90a;gfx942" \ + -DGPU_TARGETS="gfx942" \ -DCMAKE_CXX_FLAGS=" -O3 " """ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ - -DGPU_TARGETS="gfx90a;gfx942" \ + -DGPU_TARGETS="gfx942" \ -DCMAKE_CXX_COMPILER="${build_compiler()}" \ -DCMAKE_CXX_FLAGS=" -O3 " .. 
&& make -j """ } From ef72a4b9bc2e5ddc63d9138cae4e5eba23d35b16 Mon Sep 17 00:00:00 2001 From: Khushbu Agarwal Date: Fri, 9 May 2025 00:18:07 -0700 Subject: [PATCH 108/443] Disable SMFMA for gfx90a (#2182) --- include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 10 +++++++++- tile_engine/ops/gemm/gemm_instance_builder.py | 6 +++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp index c98d46e3a0..5cc5ddc70e 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp @@ -97,12 +97,20 @@ using WarpGemmMfmaF16F16F32M64N4K16 = WarpGemmImpl>; // fp16 2:4 structured sparsity - +#if defined(__gfx94__) || defined(__gfx95__) using WarpGemmSmfmacF16F16F32M32N32K16 = WarpGemmSmfmacImpl>>; using WarpGemmSmfmacF16F16F32M16N16K32 = WarpGemmSmfmacImpl>>; +#else // gfx 90a does not support smfmac +using WarpGemmSmfmacF16F16F32M32N32K16 = WarpGemmImpl, + 2>>; +using WarpGemmSmfmacF16F16F32M16N16K32 = WarpGemmImpl, + 2>>; +#endif // bf16 using WarpGemmMfmaBf16Bf16F32M32N32K8 = WarpGemmImpl< diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index 3839523e3d..c00554df8f 100755 --- a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -535,7 +535,11 @@ struct GemmDispatcher { ((tile[6] == 32 and tile[7] == 32 and tile[8] == 16) or (tile[6] == 16 and tile[7] == 16 and tile[8] == 32)) content += f""" - run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(sparse)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream);""" +#if defined(__gfx908__) + run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(False)}>>(c_m_n_dev_buf, c_m_n_host_result, 
c_m_n_dev_result, verify, args, stream); +#else + run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(sparse)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream); +#endif""" content += f""" }} else {{""" for tile in tile_params: From a23390163d604d9f00ea43e920822ac7cfb0884f Mon Sep 17 00:00:00 2001 From: Mingtao Gu <145657261+mtgu0705@users.noreply.github.com> Date: Fri, 9 May 2025 23:25:31 +0800 Subject: [PATCH 109/443] fix moe gemm2 for gfx950 (#2164) Co-authored-by: mtgu0705 --- example/65_gemm_multiply_multiply/CMakeLists.txt | 2 +- example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt index 5d2a097576..8d51d43c65 100644 --- a/example/65_gemm_multiply_multiply/CMakeLists.txt +++ b/example/65_gemm_multiply_multiply/CMakeLists.txt @@ -7,7 +7,7 @@ add_example_executable(example_gemm_multiply_multiply_xdl_int8 gemm_multiply_mul add_example_executable(example_moe_gemm1_xdl_fp8 moe_gemm1_xdl_fp8.cpp) add_example_executable(example_moe_gemm2_xdl_fp8 moe_gemm2_xdl_fp8.cpp) -list(APPEND gpu_list gfx942) +list(APPEND gpu_list gfx942 gfx950) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp index b9621cc9b3..3745e3d0af 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp @@ -281,7 +281,7 @@ int main(int argc, char* argv[]) break; case 4: a0_t_k_k.GenerateTensorValue(GeneratorTensor_1{}); - b0_e_n_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_e_n_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); 
d0_t_n.GenerateTensorValue(GeneratorTensor_1{}); d1_e_n.GenerateTensorValue(GeneratorTensor_1{}); d2_e_n.GenerateTensorValue(GeneratorTensor_1{}); From 6b1a339b6faca7e423fdbce67a40a8fca7445abd Mon Sep 17 00:00:00 2001 From: jefyang1 <146495389+jefyang1@users.noreply.github.com> Date: Fri, 9 May 2025 09:01:06 -0700 Subject: [PATCH 110/443] Fix grouped conv bwd data tests on gfx950 (#2173) --- ...ice_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp index 3028cd7cbc..41f596d160 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp @@ -179,8 +179,7 @@ __global__ void const ComputePtrOffsetOfN compute_ptr_offset_of_n, const index_t num_k_per_block) { -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \ - defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) // offset base pointer for each work-group const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z); const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / karg.KBatch); @@ -251,8 +250,7 @@ __global__ void const ComputePtrOffsetOfN compute_ptr_offset_of_n, const index_t num_k_per_block) { -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \ - defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z); const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / 
karg.KBatch); const index_t k_idx = From 6fddb5708ca28a84519675ffd3f0ca5c25442706 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Fri, 9 May 2025 22:52:34 +0200 Subject: [PATCH 111/443] Add grouped conv fwd bias relu instances (#2179) * Add grouped conv fwd bias relu instances * fixes * fix --- ..._conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp | 174 ++++---- ...d_multiple_d_xdl_large_tensor_cshuffle.hpp | 250 ++++++------ .../element/binary_element_wise_operation.hpp | 8 + ...ice_grouped_conv_fwd_xdl_comp_instance.hpp | 131 +++--- .../device_grouped_conv_fwd_xdl_instance.hpp | 383 ++++++++++-------- ...ped_conv_fwd_xdl_large_tensor_instance.hpp | 37 +- ...vice_grouped_conv_fwd_xdl_mem_instance.hpp | 171 ++++---- ...ed_conv_fwd_xdl_merged_groups_instance.hpp | 63 +-- .../grouped_convolution_forward_bias_relu.hpp | 141 +++++++ ...uped_convolution_forward_bias_relu_xdl.inc | 242 +++++++++++ .../CMakeLists.txt | 16 + ...hwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp | 67 +++ ...l_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp | 61 +++ ...c_gkyxc_nhwgk_bf16_comp_part2_instance.cpp | 67 +++ ..._nhwgc_gkyxc_nhwgk_bf16_16x16_instance.cpp | 60 +++ ...lu_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 60 +++ ...tensor_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 41 ++ ...gc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp | 63 +++ ...gc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp | 63 +++ ...groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 80 ++++ .../CMakeLists.txt | 16 + ...dhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp | 127 ++++++ ...hwgc_gkzyxc_ndhwgk_bf16_16x16_instance.cpp | 58 +++ ...xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp | 58 +++ ...sor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp | 41 ++ ..._gkzyxc_ndhwgk_bf16_mem_inter_instance.cpp | 61 +++ ..._gkzyxc_ndhwgk_bf16_mem_intra_instance.cpp | 61 +++ ...ups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp | 51 +++ ...rofile_grouped_conv_fwd_bias_relu_impl.hpp | 277 +++++++++++++ .../profile_grouped_conv_fwd_impl.hpp | 2 +- test/CMakeLists.txt | 1 + 
.../CMakeLists.txt | 4 + .../test_grouped_convnd_fwd_bias_relu.cpp | 92 +++++ 33 files changed, 2477 insertions(+), 550 deletions(-) create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_relu.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_relu_xdl.inc create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/comp/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/comp/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/comp/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/large_tensor/device_grouped_conv2d_fwd_bias_relu_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/mem/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/mem/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/merged_groups/device_grouped_conv2d_fwd_bias_relu_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/comp/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/large_tensor/device_grouped_conv3d_fwd_bias_relu_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/mem/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/mem/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/merged_groups/device_grouped_conv3d_fwd_bias_relu_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp create mode 100644 profiler/include/profiler/profile_grouped_conv_fwd_bias_relu_impl.hpp create mode 100644 test/grouped_convnd_fwd_bias_relu/CMakeLists.txt create mode 100644 test/grouped_convnd_fwd_bias_relu/test_grouped_convnd_fwd_bias_relu.cpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp 
b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp index a93e6ded96..bebcd72ceb 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp @@ -279,9 +279,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 static constexpr bool isMultiD = DsDataType::Size() > 0; static constexpr bool isMultiABD = isMultiA || isMultiB || isMultiD; - // multi ABD not supported - static_assert(!isMultiABD, "Multi A, Mutli B and Multi D are not supported"); - static constexpr index_t NumATensor = GetNumABTensors(); static constexpr index_t NumBTensor = GetNumABTensors(); static constexpr index_t NumDTensor = DsDataType::Size(); @@ -1080,91 +1077,96 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { float avg_time = 0.f; - - if constexpr(is_NGCHW_GKCYX_NGKHW() || - is_NGCDHW_GKCZYX_NGKDHW()) + if constexpr(!isMultiABD) { - const index_t a_grid_size = - arg.elementwise_block_2_ctile_map_transpose_a_.CalculateGridSize( - arg.a_in_transpose_desc_); - const index_t b_grid_size = - arg.elementwise_block_2_ctile_map_transpose_b_.CalculateGridSize( - arg.b_in_transpose_desc_); + if constexpr(is_NGCHW_GKCYX_NGKHW() || + is_NGCDHW_GKCZYX_NGKDHW()) + { + const index_t a_grid_size = + arg.elementwise_block_2_ctile_map_transpose_a_.CalculateGridSize( + arg.a_in_transpose_desc_); + const index_t b_grid_size = + arg.elementwise_block_2_ctile_map_transpose_b_.CalculateGridSize( + arg.b_in_transpose_desc_); - ADataType* p_a_out_grid = type_convert(arg.p_workspace_); - BDataType* p_b_out_grid = type_convert(arg.p_workspace_) + - arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType); + ADataType* p_a_out_grid = type_convert(arg.p_workspace_); + BDataType* p_b_out_grid = + 
type_convert(arg.p_workspace_) + + arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType); - auto kernel_transpose = kernel_elementwise_dual, - ck::Tuple, - ck::Tuple, - ck::Tuple, - ck::Tuple, - ck::Tuple, - ck::Tuple, - ck::Tuple, - Block2TileMapElementwise, - Block2TileMapElementwise, - element_wise::PassThrough>; + auto kernel_transpose = + kernel_elementwise_dual, + ck::Tuple, + ck::Tuple, + ck::Tuple, + ck::Tuple, + ck::Tuple, + ck::Tuple, + ck::Tuple, + Block2TileMapElementwise, + Block2TileMapElementwise, + element_wise::PassThrough>; - avg_time += launch_and_time_kernel(stream_config, - kernel_transpose, - dim3(a_grid_size + b_grid_size), - dim3(ElementwiseBlocksize), - 0, - make_tuple(arg.a_in_transpose_desc_), - make_tuple(arg.b_in_transpose_desc_), - make_tuple(arg.a_out_transpose_desc_), - make_tuple(arg.b_out_transpose_desc_), - make_tuple(arg.p_a_grid_), - make_tuple(arg.p_b_grid_), - make_tuple(p_a_out_grid), - make_tuple(p_b_out_grid), - arg.elementwise_block_2_ctile_map_transpose_a_, - arg.elementwise_block_2_ctile_map_transpose_b_, - element_wise::PassThrough{}, - a_grid_size); + avg_time += + launch_and_time_kernel(stream_config, + kernel_transpose, + dim3(a_grid_size + b_grid_size), + dim3(ElementwiseBlocksize), + 0, + make_tuple(arg.a_in_transpose_desc_), + make_tuple(arg.b_in_transpose_desc_), + make_tuple(arg.a_out_transpose_desc_), + make_tuple(arg.b_out_transpose_desc_), + make_tuple(arg.p_a_grid_), + make_tuple(arg.p_b_grid_), + make_tuple(p_a_out_grid), + make_tuple(p_b_out_grid), + arg.elementwise_block_2_ctile_map_transpose_a_, + arg.elementwise_block_2_ctile_map_transpose_b_, + element_wise::PassThrough{}, + a_grid_size); + } + + avg_time += RunGemm(arg, stream_config); + + if constexpr(is_NGCHW_GKCYX_NGKHW() || + is_NGCDHW_GKCZYX_NGKDHW()) + { + const index_t grid_size = + arg.elementwise_block_2_ctile_map_transpose_e_.CalculateGridSize( + arg.e_in_transpose_desc_); + + const EDataType* p_e_in_grid = + type_convert(arg.p_workspace_) 
+ + (arg.GetWorkspaceATensorSizeBytes() + arg.GetWorkspaceBTensorSizeBytes()) / + sizeof(EDataType); + + EDataType* p_e_out_grid = arg.p_e_grid_; + + auto kernel_transpose = kernel_elementwise, + ck::Tuple, + ck::Tuple, + ck::Tuple, + Block2TileMapElementwise, + element_wise::PassThrough>; + + avg_time += + launch_and_time_kernel(stream_config, + kernel_transpose, + dim3(grid_size), + dim3(ElementwiseBlocksize), + 0, + make_tuple(arg.e_in_transpose_desc_), + make_tuple(arg.e_out_transpose_desc_), + make_tuple(p_e_in_grid), + make_tuple(p_e_out_grid), + arg.elementwise_block_2_ctile_map_transpose_e_, + element_wise::PassThrough{}); + } } - - avg_time += RunGemm(arg, stream_config); - - if constexpr(is_NGCHW_GKCYX_NGKHW() || - is_NGCDHW_GKCZYX_NGKDHW()) - { - const index_t grid_size = - arg.elementwise_block_2_ctile_map_transpose_e_.CalculateGridSize( - arg.e_in_transpose_desc_); - - const EDataType* p_e_in_grid = - type_convert(arg.p_workspace_) + - (arg.GetWorkspaceATensorSizeBytes() + arg.GetWorkspaceBTensorSizeBytes()) / - sizeof(EDataType); - - EDataType* p_e_out_grid = arg.p_e_grid_; - - auto kernel_transpose = kernel_elementwise, - ck::Tuple, - ck::Tuple, - ck::Tuple, - Block2TileMapElementwise, - element_wise::PassThrough>; - - avg_time += launch_and_time_kernel(stream_config, - kernel_transpose, - dim3(grid_size), - dim3(ElementwiseBlocksize), - 0, - make_tuple(arg.e_in_transpose_desc_), - make_tuple(arg.e_out_transpose_desc_), - make_tuple(p_e_in_grid), - make_tuple(p_e_out_grid), - arg.elementwise_block_2_ctile_map_transpose_e_, - element_wise::PassThrough{}); - } - return avg_time; } @@ -1182,6 +1184,12 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 const index_t G = arg.b_g_k_c_xs_lengths_[I0]; const index_t K = arg.b_g_k_c_xs_lengths_[I1]; const index_t C = arg.b_g_k_c_xs_lengths_[I2]; + // Move this to runtime check to align Conv instances + // with Conv Multiple D instances + if constexpr(isMultiABD) + { + return false; + } // check device 
if(get_device_name() == "gfx908") diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp index b2903121b1..3c34d77cc9 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -192,7 +192,6 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor static constexpr index_t NumDTensor = DsDataType::Size(); static constexpr index_t MaxGemmsNum = 32; - static_assert(NumDTensor == 0, "MultiD not supported."); static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -440,89 +439,94 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor input_left_pads_{input_left_pads}, input_right_pads_{input_right_pads} { - // Perform grouped gemm, generate array of tranformer for convolution - Array conv_to_gemm_transformer_arr; - Array a_grid_ptrs; - Array c_grid_ptrs; - - ck::tie(conv_to_gemm_transformer_arr, - a_grid_ptrs, - c_grid_ptrs, - gemms_count_, - is_split_valid_) = - GenerateConvToGemmTransforms( - ConvToGemmFwdTransformerLongIndexT{a_g_n_c_wis_lengths_, - a_g_n_c_wis_strides_, - b_g_k_c_xs_lengths_, - b_g_k_c_xs_strides_, - e_g_n_k_wos_lengths_, - e_g_n_k_wos_strides_, - conv_filter_strides_, - conv_filter_dilations_, - input_left_pads_, - input_right_pads_}, - static_cast(p_a), - static_cast(p_e)); - - grid_size_ = 0; - valid_gemms_count_ = 0; - - if(is_split_valid_) + if constexpr(NumDTensor == 0) { - // Create GemmArg for each gemm(conv) - for(index_t i = 0; i < 
gemms_count_; i++) + // Perform grouped gemm, generate array of tranformer for convolution + Array conv_to_gemm_transformer_arr; + Array a_grid_ptrs; + Array c_grid_ptrs; + + ck::tie(conv_to_gemm_transformer_arr, + a_grid_ptrs, + c_grid_ptrs, + gemms_count_, + is_split_valid_) = + GenerateConvToGemmTransforms( + ConvToGemmFwdTransformerLongIndexT{a_g_n_c_wis_lengths_, + a_g_n_c_wis_strides_, + b_g_k_c_xs_lengths_, + b_g_k_c_xs_strides_, + e_g_n_k_wos_lengths_, + e_g_n_k_wos_strides_, + conv_filter_strides_, + conv_filter_dilations_, + input_left_pads_, + input_right_pads_}, + static_cast(p_a), + static_cast(p_e)); + + grid_size_ = 0; + valid_gemms_count_ = 0; + + if(is_split_valid_) { - const AGridDesc_M_K a_grid_desc_m_k{DeviceOp::MakeAGridDescriptor_M_K( - conv_to_gemm_transformer_arr[i])}; - const BGridDesc_N_K b_grid_desc_n_k{DeviceOp::MakeBGridDescriptor_N_K( - conv_to_gemm_transformer_arr[i])}; - const auto e_grid_desc_m_n = - DeviceOp::MakeEGridDescriptor_M_N(conv_to_gemm_transformer_arr[i]); - - const auto block_2_etile_map = - GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n); - - const index_t grid_size_grp = - block_2_etile_map.CalculateGridSize(e_grid_desc_m_n); - - const index_t BlockStart = grid_size_; - const index_t BlockEnd = grid_size_ + grid_size_grp; - - grid_size_ += grid_size_grp; - - if(GridwiseGemm::CheckValidity(a_grid_desc_m_k, - b_grid_desc_n_k, - Tuple<>{}, - e_grid_desc_m_n, - block_2_etile_map)) + // Create GemmArg for each gemm(conv) + for(index_t i = 0; i < gemms_count_; i++) { + const AGridDesc_M_K a_grid_desc_m_k{ + DeviceOp::MakeAGridDescriptor_M_K( + conv_to_gemm_transformer_arr[i])}; + const BGridDesc_N_K b_grid_desc_n_k{ + DeviceOp::MakeBGridDescriptor_N_K( + conv_to_gemm_transformer_arr[i])}; + const auto e_grid_desc_m_n = DeviceOp::MakeEGridDescriptor_M_N( + conv_to_gemm_transformer_arr[i]); - gemm_desc_kernel_args_(valid_gemms_count_) = GemmArgs{ - a_grid_ptrs[i], - static_cast(p_b), - c_grid_ptrs[i], - 
GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k), - GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k), - GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - e_grid_desc_m_n), - block_2_etile_map, - BlockStart, - BlockEnd}; + const auto block_2_etile_map = + GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n); - valid_gemms_count_++; + const index_t grid_size_grp = + block_2_etile_map.CalculateGridSize(e_grid_desc_m_n); + + const index_t BlockStart = grid_size_; + const index_t BlockEnd = grid_size_ + grid_size_grp; + + grid_size_ += grid_size_grp; + + if(GridwiseGemm::CheckValidity(a_grid_desc_m_k, + b_grid_desc_n_k, + Tuple<>{}, + e_grid_desc_m_n, + block_2_etile_map)) + { + + gemm_desc_kernel_args_(valid_gemms_count_) = GemmArgs{ + a_grid_ptrs[i], + static_cast(p_b), + c_grid_ptrs[i], + GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k), + GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k), + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n), + block_2_etile_map, + BlockStart, + BlockEnd}; + + valid_gemms_count_++; + } } + // N is the same for all convs + conv_N_per_block_ = static_cast(conv_to_gemm_transformer_arr[I0].N_); } - // N is the same for all convs - conv_N_per_block_ = static_cast(conv_to_gemm_transformer_arr[I0].N_); + + // Strides for G and N remain the same + compute_ptr_offset_of_groups_.BatchStrideA_ = a_g_n_c_wis_strides[0]; + compute_ptr_offset_of_groups_.BatchStrideB_ = b_g_k_c_xs_strides[0]; + compute_ptr_offset_of_groups_.BatchStrideE_ = e_g_n_k_wos_strides[0]; + + compute_ptr_offset_of_n_.BatchStrideA_ = a_g_n_c_wis_strides[1] * conv_N_per_block_; + compute_ptr_offset_of_n_.BatchStrideE_ = e_g_n_k_wos_strides[1] * conv_N_per_block_; } - - // Strides for G and N remain the same - compute_ptr_offset_of_groups_.BatchStrideA_ = a_g_n_c_wis_strides[0]; - compute_ptr_offset_of_groups_.BatchStrideB_ = 
b_g_k_c_xs_strides[0]; - compute_ptr_offset_of_groups_.BatchStrideE_ = e_g_n_k_wos_strides[0]; - - compute_ptr_offset_of_n_.BatchStrideA_ = a_g_n_c_wis_strides[1] * conv_N_per_block_; - compute_ptr_offset_of_n_.BatchStrideE_ = e_g_n_k_wos_strides[1] * conv_N_per_block_; } void Print() const @@ -578,55 +582,63 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor { float Run(const DeviceOp::Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - if(stream_config.log_level_ > 0) + if constexpr(NumDTensor == 0) { - arg.Print(); - } + if(stream_config.log_level_ > 0) + { + arg.Print(); + } - const index_t num_workgroups_per_Conv_N = - arg.a_g_n_c_wis_lengths_[I1] / arg.conv_N_per_block_; + const index_t num_workgroups_per_Conv_N = + arg.a_g_n_c_wis_lengths_[I1] / arg.conv_N_per_block_; - const index_t gdx = arg.grid_size_; - const index_t gdy = arg.num_group_; - const index_t gdz = num_workgroups_per_Conv_N; + const index_t gdx = arg.grid_size_; + const index_t gdy = arg.num_group_; + const index_t gdz = num_workgroups_per_Conv_N; - // K is constant for all gemms - const auto K = arg.gemm_desc_kernel_args_[I0].a_grid_desc_ak0_m_ak1_.GetLength(I0) * - arg.gemm_desc_kernel_args_[I0].a_grid_desc_ak0_m_ak1_.GetLength(I2); + // K is constant for all gemms + const auto K = arg.gemm_desc_kernel_args_[I0].a_grid_desc_ak0_m_ak1_.GetLength(I0) * + arg.gemm_desc_kernel_args_[I0].a_grid_desc_ak0_m_ak1_.GetLength(I2); - auto launch_kernel = [&](auto has_main_k_block_loop) { - constexpr bool has_main_loop = has_main_k_block_loop.value; - const auto kernel = kernel_grouped_conv_fwd_multiple_d_grouped_gemm_xdl_cshuffle< - GridwiseGemm, - MaxGemmsNum, - GemmArgs, - AElementwiseOperation, - BElementwiseOperation, - CDEElementwiseOperation, - ComputePtrOffsetOfStridedBatch, - has_main_loop>; + auto launch_kernel = [&](auto has_main_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; + const auto kernel = + 
kernel_grouped_conv_fwd_multiple_d_grouped_gemm_xdl_cshuffle< + GridwiseGemm, + MaxGemmsNum, + GemmArgs, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + ComputePtrOffsetOfStridedBatch, + has_main_loop>; - return launch_and_time_kernel(stream_config, - kernel, - dim3(gdx, gdy, gdz), - dim3(BlockSize), - 0, - arg.gemm_desc_kernel_args_, - arg.gemms_count_, - arg.a_element_op_, - arg.b_element_op_, - arg.cde_element_op_, - arg.compute_ptr_offset_of_groups_, - arg.compute_ptr_offset_of_n_); - }; + return launch_and_time_kernel(stream_config, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + arg.gemm_desc_kernel_args_, + arg.gemms_count_, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.compute_ptr_offset_of_groups_, + arg.compute_ptr_offset_of_n_); + }; - if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) - { - return launch_kernel(integral_constant{}); + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}); + } } else { - return launch_kernel(integral_constant{}); + return 0.f; } } @@ -643,6 +655,12 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor const long_index_t K = arg.b_g_k_c_xs_lengths_[I1]; const long_index_t C = arg.b_g_k_c_xs_lengths_[I2]; + // Move this to runtime check to align Conv instances + // with Conv Multiple D instances + if constexpr(NumDTensor != 0) + { + return false; + } // Check if all descs are valid if(!(arg.is_split_valid_ && arg.gemms_count_ == arg.valid_gemms_count_)) diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp index 530876650e..0e58d5acb4 100644 --- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp @@ -404,6 +404,14 @@ struct AddRelu y = a > 
type_convert(0.0f) ? a : type_convert(0.0f); }; + template <> + __host__ __device__ constexpr void + operator()(bhalf_t& y, const bhalf_t& x0, const bhalf_t& x1) const + { + const float a = type_convert(x0) + type_convert(x1); + y = a > type_convert(0.0f) ? a : type_convert(0.0f); + }; + template <> __host__ __device__ constexpr void operator()(int& y, const int& x0, const int8_t& x1) const diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp index 158ed26ec4..17ffa65d1c 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp @@ -33,6 +33,7 @@ using Empty_Tuple = ck::Tuple<>; using namespace ck::tensor_layout::convolution; using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddRelu = ck::tensor_operation::element_wise::AddRelu; static constexpr auto ConvFwdDefault = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; @@ -55,14 +56,16 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_bf16_comp_instances_2x = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| 
CBlockTransfer| //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> // clang-format on >; @@ -71,7 +74,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_bf16_comp_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| 
ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -79,17 +84,17 @@ using device_grouped_conv_fwd_xdl_bf16_comp_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // Compute friendly - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 
8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3> // clang-format on >; @@ -99,15 +104,17 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp 
= PassThrough> using device_grouped_conv_fwd_xdl_bf16_comp_instances_part2 = std::tuple< // clang-format off - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, // AGPR Spill when use permuted lds layout. so, use padding for these two. - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 64, 1, 4>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 64, 1, 4>, 8, BlockGemmPipelineScheduler::Intrawave, 
BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5> // clang-format on >; @@ -117,14 +124,16 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_f16_comp_instances_2x = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| 
PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> // clang-format on >; @@ -133,14 +142,16 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_f16_comp_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| //########################################| | | | | | | | | | | | 
Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4> // clang-format on >; @@ -150,22 +161,24 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_f16_comp_instances_part2 = std::tuple< // clang-format off - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, 
BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, // AGPR Spill when use permuted lds layout. so, use padding for these two. 
- DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 64, 1, 4>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 2, 1, S<1, 64, 
1, 4>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> // clang-format on >; @@ -174,17 +187,19 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_f32_comp_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| 
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<4, 64, 1>, S<1, 0, 
2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> // clang-format on >; @@ -194,14 +209,16 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_int8_comp_instances_2x = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| 
AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> // clang-format on >; @@ -210,14 +227,16 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_int8_comp_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| 
Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4> // clang-format on >; @@ -227,18 +246,20 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_int8_comp_instances_part2 = std::tuple< // clang-format off - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 
64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, // AGPR Spill when use permuted lds layout. so, use padding for these two. 
- DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1> // clang-format on >; } // namespace instance diff --git 
a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp index f5397308dc..df24b4cbcb 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp @@ -33,6 +33,7 @@ using Empty_Tuple = ck::Tuple<>; using namespace ck::tensor_layout::convolution; using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddRelu = ck::tensor_operation::element_wise::AddRelu; static constexpr auto ConvFwdDefault = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; @@ -51,7 +52,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_bf16_generic_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -59,7 +62,7 @@ using device_grouped_conv_fwd_xdl_bf16_generic_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| 
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // generic instance - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1> // clang-format on >; @@ -68,7 +71,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_bf16_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -76,24 +81,24 @@ using device_grouped_conv_fwd_xdl_bf16_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // generic instance - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 
2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, // instances for small conv.K and conv.C - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 
2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 
0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> // clang-format on >; @@ -102,17 +107,19 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_bf16_16x16_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| 
AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 4>, 4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8> // clang-format on >; @@ -121,7 +128,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + 
typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_f16_generic_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -129,7 +138,7 @@ using device_grouped_conv_fwd_xdl_f16_generic_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // generic instance - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1> // clang-format on >; @@ -138,7 +147,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_f16_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| 
ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -146,24 +157,24 @@ using device_grouped_conv_fwd_xdl_f16_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // generic instance - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, // instances for small conv.K and conv.C - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 
1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> // clang-format on >; @@ 
-172,17 +183,19 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_f16_16x16_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 
4>, 2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 4>, 4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8> // clang-format on >; @@ -191,7 +204,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_f32_generic_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -199,7 +214,7 @@ using device_grouped_conv_fwd_xdl_f32_generic_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | 
Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // generic instance - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1> // clang-format on >; @@ -208,7 +223,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_f32_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -216,24 +233,24 @@ using device_grouped_conv_fwd_xdl_f32_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // generic instance - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<1, 0, 2>, 
S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1>, // instances for small conv.K and conv.C - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, 
- DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 
16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> // clang-format on >; @@ -242,7 +259,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_f32_16x16_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -250,9 +269,9 @@ using device_grouped_conv_fwd_xdl_f32_16x16_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| 
Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // generic instance - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 4>, 4> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 8, 1, 1, 1, S<1, 32, 1, 4>, 4> // clang-format on >; @@ -261,7 +280,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_int8_generic_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| 
CBlockTransferClusterLengths| CBlockTransfer| @@ -269,7 +290,7 @@ using device_grouped_conv_fwd_xdl_int8_generic_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // generic instance - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1> // clang-format on >; @@ -278,7 +299,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_int8_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -286,24 +309,24 @@ using device_grouped_conv_fwd_xdl_int8_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| 
PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // generic instance - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, // instances for small conv.K and conv.C - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 
1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 
2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> // clang-format on >; @@ -312,7 +335,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_f16_comp_f8_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| 
BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| ComputeType| @@ -321,24 +346,24 @@ using device_grouped_conv_fwd_xdl_f16_comp_f8_instances = std::tuple< //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | #ifdef CK_ENABLE_FP8 // generic instance - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, F8>, // instances for small conv.K and conv.C - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, 
S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, 
S<1, 16, 1, 8>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8> #endif // clang-format on >; @@ -348,7 +373,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_f8_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| 
CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| ComputeType| @@ -357,24 +384,24 @@ using device_grouped_conv_fwd_xdl_f8_instances = std::tuple< //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | #ifdef CK_ENABLE_FP8 // generic instance - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, F8>, // instances for small conv.K and conv.C - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 
32, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, + 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8> #endif // clang-format on >; @@ -384,7 +411,9 
@@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_bf8_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| ComputeType| @@ -393,24 +422,24 @@ using device_grouped_conv_fwd_xdl_bf8_instances = std::tuple< //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | #ifdef CK_ENABLE_BF8 // generic instance - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, BF8>, // instances for small conv.K and conv.C - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8>, - 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, BF8>, - 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, BF8> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, BF8>, + 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, BF8> #endif // clang-format on >; @@ -420,7 +449,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_f8_bf8_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|AComputeType|BComputeType| @@ -429,24 +460,24 @@ using device_grouped_conv_fwd_xdl_f8_bf8_instances = std::tuple< //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | #if(defined(CK_ENABLE_FP8) && defined(CK_ENABLE_BF8)) // generic instance - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, F8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, F8, BF8>, // instances for small conv.K and conv.C - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, F8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 
2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, F8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, F8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8, BF8>, - 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, F8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8, BF8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8, BF8> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, F8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 
8, F8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, F8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8, BF8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8, BF8> #endif // clang-format on >; @@ -456,7 +487,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_bf8_f8_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|AComputeType|BComputeType| @@ -465,24 +498,24 @@ using device_grouped_conv_fwd_xdl_bf8_f8_instances = std::tuple< //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | #if(defined(CK_ENABLE_FP8) && defined(CK_ENABLE_BF8)) // generic instance - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 
1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, BF8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, BF8, F8>, // instances for small conv.K and conv.C - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, BF8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, BF8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, BF8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, BF8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, BF8, F8>, - 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, BF8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, BF8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, BF8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, BF8, F8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, BF8, F8> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, BF8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 
8, BF8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, BF8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, BF8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, BF8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, BF8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, BF8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, BF8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, BF8, F8> #endif // clang-format on >; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp index 0a85cde3bc..6bb6d255f3 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp @@ -25,6 +25,7 @@ using Empty_Tuple = 
ck::Tuple<>; using namespace ck::tensor_layout::convolution; using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddRelu = ck::tensor_operation::element_wise::AddRelu; static constexpr auto ConvFwdDefault = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; @@ -36,7 +37,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_large_tensor_bf16_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -44,10 +47,10 @@ using device_grouped_conv_fwd_xdl_large_tensor_bf16_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // generic instance - DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - 
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 2>, - DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 2>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> // clang-format on >; @@ -56,7 +59,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_large_tensor_f16_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -64,10 +69,10 @@ using device_grouped_conv_fwd_xdl_large_tensor_f16_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // generic instance - 
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 2>, - DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 2>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> // clang-format on >; @@ -76,7 +81,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_large_tensor_f32_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -84,9 +91,9 @@ using device_grouped_conv_fwd_xdl_large_tensor_f32_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| 
Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // generic instance - DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> // clang-format on >; @@ -95,7 +102,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_large_tensor_int8_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -103,9 +112,9 @@ using device_grouped_conv_fwd_xdl_large_tensor_int8_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| 
Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // generic instance - DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> // clang-format on >; } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp index 1f381af08c..195367ffd7 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" @@ -33,6 +33,7 @@ using Empty_Tuple = ck::Tuple<>; using namespace ck::tensor_layout::convolution; using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddRelu = ck::tensor_operation::element_wise::AddRelu; static constexpr auto ConvFwdDefault = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; @@ -52,7 +53,9 @@ template + BlockGemmPipelineScheduler BlkGemmPipeSched, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_bf16_mem_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -60,27 +63,27 @@ using device_grouped_conv_fwd_xdl_bf16_mem_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // Latency friendly - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 
8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, // Memory friendly - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, 
BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 
2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, 
BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 
2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> // clang-format on >; @@ -90,34 +93,36 @@ template + BlockGemmPipelineScheduler BlkGemmPipeSched, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_f16_mem_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 
1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, // Memory friendly - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, 
S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, 
S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> // clang-format on >; @@ -127,30 +132,32 @@ template + BlockGemmPipelineScheduler BlkGemmPipeSched, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_f32_mem_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 
| | | | | | | | | | | | - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, // Memory friendly - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 
0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, 
BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 
4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> // clang-format on >; @@ -160,34 +167,36 @@ template + BlockGemmPipelineScheduler BlkGemmPipeSched, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_int8_mem_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| 
ScalarPerVector| //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 
0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v1>, // Memory friendly - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 
16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 32, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 2, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 4>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, 
S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 8>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 4, BlkGemmPipeSched, BlockGemmPipelineVersion::v2>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 1, S<1, 16, 1, 16>, 8, BlkGemmPipeSched, BlockGemmPipelineVersion::v2> // clang-format on >; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp index 153cc61b09..182c785978 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" @@ -25,6 +25,7 @@ using Empty_Tuple = ck::Tuple<>; using namespace ck::tensor_layout::convolution; using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddRelu = ck::tensor_operation::element_wise::AddRelu; static constexpr auto ConvFwdDefault = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; @@ -38,7 +39,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_merged_groups_bf16_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| ACompute| BCompute| BlockGemm| NumGroups| @@ -46,9 +49,9 @@ using device_grouped_conv_fwd_xdl_merged_groups_bf16_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | Scheduler| | //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // Instances with NumGroupsPerBatch > 1 - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, 
LoopScheduler::Default, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 16>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 32> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 16>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 32> // clang-format on >; @@ -58,16 +61,18 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| ACompute| BCompute| BlockGemm| NumGroups| //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| 
Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Type| Type| Pipeline| ToMerge| //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | Scheduler| | //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 16>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 32> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 16>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 
16, 1, 4>, 1, BF16, BF16, LoopScheduler::Default, 32> // clang-format on >; @@ -76,7 +81,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_merged_groups_f16_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -84,9 +91,9 @@ using device_grouped_conv_fwd_xdl_merged_groups_f16_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // Instances with NumGroupsPerBatch > 1 - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 16>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 32> + 
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 16>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 32> // clang-format on >; @@ -96,7 +103,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_merged_groups_f16_instances_2x = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -104,9 +113,9 @@ using device_grouped_conv_fwd_xdl_merged_groups_f16_instances_2x = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // Instances with NumGroupsPerBatch > 1 - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, 
S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 16>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 32> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 16>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F16, F16, LoopScheduler::Default, 32> // clang-format on >; @@ -115,7 +124,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_merged_groups_f32_instances = std::tuple< // clang-format off //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -123,9 +134,9 @@ using device_grouped_conv_fwd_xdl_merged_groups_f32_instances = std::tuple< 
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // Instances with NumGroupsPerBatch > 1 - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F32, F32, LoopScheduler::Default, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F32, F32, LoopScheduler::Default, 16>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F32, F32, LoopScheduler::Default, 32> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F32, F32, LoopScheduler::Default, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F32, F32, LoopScheduler::Default, 16>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 4>, 1, F32, F32, LoopScheduler::Default, 32> // clang-format on >; @@ -134,7 +145,9 @@ template + ConvolutionForwardSpecialization ConvSpec, + typename DsDataTypes = Tuple<>, + typename OutElementOp = PassThrough> using device_grouped_conv_fwd_xdl_merged_groups_int8_instances = std::tuple< // clang-format off //########################################| NumDim| 
A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| @@ -142,9 +155,9 @@ using device_grouped_conv_fwd_xdl_merged_groups_int8_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // Instances with NumGroupsPerBatch > 1 - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, int8_t, int8_t, LoopScheduler::Default, 8>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, int8_t, int8_t, LoopScheduler::Default, 16>, - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, int8_t, int8_t, LoopScheduler::Default, 32> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, int8_t, int8_t, LoopScheduler::Default, 8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 
1, int8_t, int8_t, LoopScheduler::Default, 16>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S< 4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, int8_t, int8_t, LoopScheduler::Default, 32> // clang-format on >; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_relu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_relu.hpp new file mode 100644 index 0000000000..d873edadba --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_relu.hpp @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +#ifdef CK_USE_XDL +#include "grouped_convolution_forward_bias_relu_xdl.inc" +#endif + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = + DeviceGroupedConvFwdMultipleABD; + + static auto GetInstances() + { + std::vector> op_ptrs; + +#ifdef CK_USE_XDL + // layout NHWGC/GKYXC/NHWGK + if constexpr(NumDimSpatial == 2 && is_same_v && + is_same_v && is_same_v) + { +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && + is_same_v && + is_same_v && + is_same_v && + is_same_v) + { + add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances( + op_ptrs); + 
add_device_grouped_conv2d_fwd_bias_relu_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_relu_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances( + op_ptrs); + } +#endif + } + // layout NDHWGC/GKZYXC/NDHWGK + if constexpr(NumDimSpatial == 3 && is_same_v && + is_same_v && is_same_v) + { +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && + is_same_v && + is_same_v && + is_same_v && + is_same_v) + { + add_device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_relu_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_relu_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances( + op_ptrs); + } +#endif + } +#endif // CK_USE_XDL + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_relu_xdl.inc 
b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_relu_xdl.inc new file mode 100644 index 0000000000..1935f123a8 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_relu_xdl.inc @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +#ifdef CK_ENABLE_BF16 + +void add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_relu_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_relu_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances); + +void 
add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_relu_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_relu_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances); + +#endif + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace 
ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/CMakeLists.txt new file mode 100644 index 0000000000..98b0b1c4cb --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/CMakeLists.txt @@ -0,0 +1,16 @@ +# ONLY XDL_KERNELS +add_instance_library(device_grouped_conv2d_fwd_bias_relu_instance + xdl/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp + xdl/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.cpp + + xdl/large_tensor/device_grouped_conv2d_fwd_bias_relu_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.cpp + + xdl/merged_groups/device_grouped_conv2d_fwd_bias_relu_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp + + xdl/mem/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp + xdl/mem/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp + + xdl/comp/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp + xdl/comp/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp + xdl/comp/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/comp/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/comp/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp new file mode 100644 index 0000000000..75acd604ee --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/comp/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro 
Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwdDefault, + Tuple, + AddRelu>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwd1x1P0, + Tuple, + AddRelu>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwd1x1S1P0, + Tuple, + AddRelu>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/comp/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/comp/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp new file mode 100644 index 0000000000..69a8a4bd9d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/comp/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +// 
Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwdDefault, + Tuple, + AddRelu>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwd1x1P0, + Tuple, + AddRelu>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwd1x1S1P0, + Tuple, + AddRelu>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/comp/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/comp/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp new file mode 100644 index 0000000000..043c724e4a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/comp/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, 
Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances) +{ + if(ck::get_device_name() != "gfx950") + { + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwdDefault, + Tuple, + AddRelu>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwd1x1P0, + Tuple, + AddRelu>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwd1x1S1P0, + Tuple, + AddRelu>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.cpp new file mode 100644 index 0000000000..c58631e169 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.cpp @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: 
MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwdDefault, + Tuple, + AddRelu>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwd1x1P0, + Tuple, + AddRelu>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwd1x1S1P0, + Tuple, + AddRelu>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp new file mode 100644 index 0000000000..cd80f2875f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwdDefault, + Tuple, + AddRelu>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwd1x1P0, + Tuple, + AddRelu>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwd1x1S1P0, + Tuple, + AddRelu>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/large_tensor/device_grouped_conv2d_fwd_bias_relu_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/large_tensor/device_grouped_conv2d_fwd_bias_relu_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.cpp new file mode 100644 index 0000000000..a6286b55e8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/large_tensor/device_grouped_conv2d_fwd_bias_relu_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_bias_relu_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_large_tensor_bf16_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwdDefault, + Tuple, + AddRelu>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/mem/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/mem/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp new file mode 100644 index 0000000000..0736325b05 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/mem/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwdDefault, + Interwave, + Tuple, + AddRelu>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwd1x1P0, + Interwave, + Tuple, + AddRelu>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwd1x1S1P0, + Interwave, + Tuple, + AddRelu>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/mem/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/mem/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp new file mode 100644 index 0000000000..0d35ab1b05 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/mem/device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_bias_relu_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwdDefault, + Intrawave, + Tuple, + AddRelu>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwd1x1P0, + Intrawave, + Tuple, + AddRelu>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwd1x1S1P0, + Intrawave, + Tuple, + AddRelu>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/merged_groups/device_grouped_conv2d_fwd_bias_relu_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/merged_groups/device_grouped_conv2d_fwd_bias_relu_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp new file mode 100644 index 0000000000..253e8b196e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_relu/xdl/merged_groups/device_grouped_conv2d_fwd_bias_relu_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.cpp @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. 
All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_bias_relu_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances( + std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwdDefault, + Tuple, + AddRelu>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwd3x3, + Tuple, + AddRelu>{}); + } + else + { + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwdDefault, + Tuple, + AddRelu>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2, + NHWGC, + GKYXC, + Tuple, + NHWGK, + ConvFwd3x3, + Tuple, + AddRelu>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/CMakeLists.txt new file mode 100644 index 0000000000..afdddfec70 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/CMakeLists.txt @@ -0,0 +1,16 @@ +# ONLY XDL_KERNELS +set(GROUPED_CONV3D_FWD + 
xdl/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp + xdl/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.cpp + + xdl/large_tensor/device_grouped_conv3d_fwd_bias_relu_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp + + xdl/merged_groups/device_grouped_conv3d_fwd_bias_relu_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp + + xdl/mem/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.cpp + xdl/mem/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.cpp + + xdl/comp/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp +) + +add_instance_library(device_grouped_conv3d_fwd_bias_relu_instance ${GROUPED_CONV3D_FWD}) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/comp/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/comp/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp new file mode 100644 index 0000000000..9819f0ea0b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/comp/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwdDefault, + Tuple, + AddRelu>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwd1x1P0, + Tuple, + AddRelu>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Tuple, + AddRelu>{}); + + if(ck::get_device_name() != "gfx950") + { + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwdDefault, + Tuple, + AddRelu>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwd1x1P0, + Tuple, + AddRelu>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Tuple, + AddRelu>{}); + } + + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwdDefault, + Tuple, + AddRelu>{}); + add_device_operation_instances( + instances, + 
device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwd1x1P0, + Tuple, + AddRelu>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Tuple, + AddRelu>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.cpp new file mode 100644 index 0000000000..dc3fc7a4bf --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.cpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwdDefault, + Tuple, + AddRelu>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwd1x1P0, + Tuple, + AddRelu>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_16x16_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Tuple, + AddRelu>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp new file mode 100644 index 0000000000..a9a8ff8459 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwdDefault, + Tuple, + AddRelu>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwd1x1P0, + Tuple, + AddRelu>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Tuple, + AddRelu>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/large_tensor/device_grouped_conv3d_fwd_bias_relu_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/large_tensor/device_grouped_conv3d_fwd_bias_relu_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp new file mode 100644 index 0000000000..e58e879973 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/large_tensor/device_grouped_conv3d_fwd_bias_relu_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_bias_relu_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_large_tensor_bf16_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwdDefault, + Tuple, + AddRelu>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/mem/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/mem/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.cpp new file mode 100644 index 0000000000..e76052c6e0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/mem/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwdDefault, + Interwave, + Tuple, + AddRelu>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwd1x1P0, + Interwave, + Tuple, + AddRelu>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Interwave, + Tuple, + AddRelu>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/mem/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/mem/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.cpp new file mode 100644 index 0000000000..0593f3f46a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/mem/device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_bias_relu_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwdDefault, + Intrawave, + Tuple, + AddRelu>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwd1x1P0, + Intrawave, + Tuple, + AddRelu>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Intrawave, + Tuple, + AddRelu>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/merged_groups/device_grouped_conv3d_fwd_bias_relu_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/merged_groups/device_grouped_conv3d_fwd_bias_relu_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp new file mode 100644 index 0000000000..6552f26f88 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_relu/xdl/merged_groups/device_grouped_conv3d_fwd_bias_relu_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_bias_relu_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + AddRelu>>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwdDefault, + Tuple, + AddRelu>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3, + NDHWGC, + GKZYXC, + Tuple, + NDHWGK, + ConvFwd3x3, + Tuple, + AddRelu>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_bias_relu_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_bias_relu_impl.hpp new file mode 100644 index 0000000000..9d38263d4e --- /dev/null +++ b/profiler/include/profiler/profile_grouped_conv_fwd_bias_relu_impl.hpp @@ -0,0 +1,277 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_relu.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_grouped_conv_fwd_bias_relu_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param) +{ + using InElementOp = ck::tensor_operation::element_wise::PassThrough; + using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; + using OutElementOp = ck::tensor_operation::element_wise::AddRelu; + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{}; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array 
input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + Tensor input(in_g_n_c_wis_desc); + Tensor weight(wei_g_k_c_xs_desc); + Tensor host_output(out_g_n_k_wos_desc); + Tensor device_output(out_g_n_k_wos_desc); + Tensor bias(out_g_n_k_wos_desc); + + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weight: " << weight.mDesc << std::endl; + std::cout << "output: " << host_output.mDesc << std::endl; + std::cout << "bias: " << bias.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + weight.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + input.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + weight.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + bias.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize()); + DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(input.mData.data()); + 
wei_device_buf.ToDevice(weight.mData.data()); + bias_device_buf.ToDevice(bias.mData.data()); + + // run reference op + if(do_verification) + { + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd{}; + + std::array, 1> d_tensors = {bias}; + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(input, + weight, + host_output, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + out_element_op, + {}, + {}, + d_tensors); + + // init host output to zero + host_output.SetZero(); + + ref_invoker.Run(ref_argument); + } + + std::string best_op_name; + float best_avg_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device op instances + bool pass = true; + + auto run_impl = [&](auto& op_ptr, auto& argument_ptr) { + // workspace_sz will be equal to 0 for other layout than NGCHW + const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + DeviceMem workspace_dev(workspace_sz); + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + // re-init output to zero before profiling next kernel + out_device_buf.SetZero(); + + std::string op_name = op_ptr->GetTypeString(); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + float avg_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + + float gb_per_sec = num_btype / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_avg_time = avg_time; + best_gb_per_sec = 
gb_per_sec; + } + + if(do_verification) + { + out_device_buf.FromDevice(device_output.mData.data()); + + pass = pass & ck::utils::check_err(device_output, host_output); + + if(do_log) + { + LogRangeAsType(std::cout << "input : ", input.mData, ",") << std::endl; + LogRangeAsType(std::cout << "weight: ", weight.mData, ",") << std::endl; + LogRangeAsType(std::cout << "host_output : ", host_output.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "device_output: ", device_output.mData, ",") + << std::endl; + } + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + } + }; + + using DeviceOp = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + AComputeType, + BComputeType>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl; + + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + {bias_device_buf.GetDeviceBuffer()}, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + {e_g_n_k_wos_lengths}, + {e_g_n_k_wos_strides}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + + run_impl(op_ptr, argument_ptr); + } + + std::cout << "Best configuration parameters:" + << "\nname: " << best_op_name << "\navg_time: " << best_avg_time + << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git 
a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp index 4bfbca5437..dfa6bc1edd 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6bde1140d9..69ffb94488 100755 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -255,6 +255,7 @@ add_subdirectory(reduce) add_subdirectory(convnd_fwd) add_subdirectory(convnd_bwd_data) add_subdirectory(grouped_convnd_fwd) +add_subdirectory(grouped_convnd_fwd_bias_relu) add_subdirectory(grouped_convnd_bwd_weight) add_subdirectory(block_to_ctile_map) add_subdirectory(softmax) diff --git a/test/grouped_convnd_fwd_bias_relu/CMakeLists.txt b/test/grouped_convnd_fwd_bias_relu/CMakeLists.txt new file mode 100644 index 0000000000..680a92b19c --- /dev/null +++ b/test/grouped_convnd_fwd_bias_relu/CMakeLists.txt @@ -0,0 +1,4 @@ +if(GPU_TARGETS MATCHES "gfx9") + add_gtest_executable(test_grouped_convnd_fwd_bias_relu test_grouped_convnd_fwd_bias_relu.cpp) + target_link_libraries(test_grouped_convnd_fwd_bias_relu PRIVATE utility device_grouped_conv2d_fwd_bias_relu_instance device_grouped_conv3d_fwd_bias_relu_instance) +endif() diff --git a/test/grouped_convnd_fwd_bias_relu/test_grouped_convnd_fwd_bias_relu.cpp b/test/grouped_convnd_fwd_bias_relu/test_grouped_convnd_fwd_bias_relu.cpp new file mode 100644 index 0000000000..c508235d9c --- /dev/null +++ b/test/grouped_convnd_fwd_bias_relu/test_grouped_convnd_fwd_bias_relu.cpp @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include +#include + +#include "profiler/profile_grouped_conv_fwd_bias_relu_impl.hpp" + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using AddRelu = ck::tensor_operation::element_wise::AddRelu; + +template +class TestGroupedConvndFwd : public ::testing::Test +{ + protected: + using DataType = std::tuple_element_t<0, Tuple>; + using InLayout = std::tuple_element_t<1, Tuple>; + using WeiLayout = std::tuple_element_t<2, Tuple>; + using OutLayout = std::tuple_element_t<3, Tuple>; + using IndexType = ck::index_t; + + std::vector conv_params; + + template + void Run() + { + EXPECT_FALSE(conv_params.empty()); + bool pass = true; + for(auto& param : conv_params) + { + pass = pass && ck::profiler::profile_grouped_conv_fwd_bias_relu_impl( + true, // do_verification + 1, // init_method: integer value + false, // do_log + false, // time_kernel + param); + } + EXPECT_TRUE(pass); + } +}; + +using namespace ck::tensor_layout::convolution; + +using KernelTypes2d = ::testing::Types>; + +using KernelTypes3d = ::testing::Types>; + +template +class TestGroupedConvndFwd2d : public TestGroupedConvndFwd +{ +}; + +template +class TestGroupedConvndFwd3d : public TestGroupedConvndFwd +{ +}; + +TYPED_TEST_SUITE(TestGroupedConvndFwd2d, KernelTypes2d); +TYPED_TEST_SUITE(TestGroupedConvndFwd3d, KernelTypes3d); + +TYPED_TEST(TestGroupedConvndFwd2d, Test2D) +{ + this->conv_params.clear(); + this->conv_params.push_back( + {2, 2, 32, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); + this->conv_params.push_back( + {2, 2, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->template Run<2>(); +} + +TYPED_TEST(TestGroupedConvndFwd3d, Test3D) +{ + this->conv_params.clear(); + this->conv_params.push_back( + {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); + this->conv_params.push_back( + {3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 
1}}); + this->template Run<3>(); +} From d8faf1c6a161ddcee98e9dfca3cc00941eec9f61 Mon Sep 17 00:00:00 2001 From: Khushbu Agarwal Date: Sat, 10 May 2025 22:40:05 -0700 Subject: [PATCH 112/443] Support for swizzle and transpose for MFMA_16x16x32_F16/BF16 (#2172) * Changes for updating tile distribution for shuffle and transpose * Fixed swizzle and transpose, removed comments * clang formatted * Adding support for bf16 type * Addressing review comments --- include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp index 5cc5ddc70e..5ed97dc05c 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp @@ -77,6 +77,18 @@ using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution = 2>>; #endif +#if defined(__gfx950__) +using WarpGemmMfmaF16F16F32M16N16K32SwizzleBTransposedCDistribution = + WarpGemmImpl, + 1>>; + +using WarpGemmMfmaBf16Bf16F32M16N16K32SwizzleBTransposedCDistribution = + WarpGemmImpl, + 1>>; +#endif + #if defined(__gfx950__) using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution = WarpGemmImpl Date: Mon, 12 May 2025 00:41:45 -0700 Subject: [PATCH 113/443] Vectorized Transpose for Batched Transpose CK Tile Operator (#2131) * Shared Memory for single data point * CKTile Transpose vectorize CP1 * CKTile Transpose vectorize CP2 * CKTile Transpose vectorize CP2.1 * fixed the compile error of the transpose tile 2d * Have the correct result for the current test sample * Changes to printing tensor * fp8 support added * Debugging for transpose * solving the corner issue * Changed padding flag * Intermideate Debugging * Intermidiate Debugging * Intermediate Debugging * Finished debugging of the transpose op * Code Cleanup * Adding edge case smoke tests * Adding Transpose test to CI/CD * Adding Transpose test to CI/CD * Adding Transpose test to CI/CD * Addressing Review 
Comment * Addressing Comments * Addressing Comments * Measuring Perf Tests * Code Cleanup * Changlog * Added the running iterations * clang format * Fix the changelog * Fix the compilation error * change the printing factor --------- Co-authored-by: ThruptiRajLakshmanaGowda --- CHANGELOG.md | 2 +- Jenkinsfile | 73 ++++++++++++++- .../ck_tile/35_batched_transpose/README.md | 2 + .../batched_transpose_api.cpp | 89 +++++++++++++------ .../batched_transpose_example.cpp | 43 ++++----- .../35_batched_transpose/script/perf_test.sh | 11 +++ .../script/run_full_test.sh | 38 ++++++++ .../35_batched_transpose/script/smoke_test.sh | 20 ++++- include/ck_tile/core/tensor/tensor_view.hpp | 17 +--- .../ck_tile/core/tensor/transpose_tile.hpp | 23 +++-- .../kernel/batched_transpose_kernel.hpp | 63 ++++++------- .../pipeline/batched_transpose_pipeline.hpp | 24 ++--- .../pipeline/batched_transpose_policy.hpp | 43 ++++----- .../pipeline/batched_transpose_problem.hpp | 15 ++-- 14 files changed, 311 insertions(+), 152 deletions(-) create mode 100755 example/ck_tile/35_batched_transpose/script/perf_test.sh create mode 100755 example/ck_tile/35_batched_transpose/script/run_full_test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index e0ec214c69..60fe2df99d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj ### Optimized -None +* Added Vectorize Transpose optimization for CK Tile (#2131) ### Fixes diff --git a/Jenkinsfile b/Jenkinsfile index 2ad96ed44b..68e0fa1246 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -362,6 +362,20 @@ def cmake_build(Map conf=[:]){ echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." 
} } + if (params.RUN_CK_TILE_TRANSPOSE_TESTS){ + try{ + archiveArtifacts "perf_transpose_*.log" + if (arch_type == 1){ + stash includes: "perf_transpose_**_gfx90a.log", name: "perf_transpose_log_gfx90a" + } + else if (arch_type == 2){ + stash includes: "perf_transpose_**_gfx942.log", name: "perf_transpose_log_gfx942" + } + } + catch(Exception err){ + echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." + } + } if (params.RUN_CK_TILE_GEMM_TESTS){ try{ archiveArtifacts "perf_tile_gemm_**.log" @@ -698,6 +712,15 @@ def process_results(Map conf=[:]){ echo "could not locate the FMHA performance logs: ${err.getMessage()}." } } + if (params.RUN_CK_TILE_TRANSPOSE_TESTS){ + try{ + unstash "perf_transpose_log_gfx942" + unstash "perf_transpose_log_gfx90a" + } + catch(Exception err){ + echo "could not locate the Transpose performance logs: ${err.getMessage()}." + } + } if (params.RUN_CK_TILE_GEMM_TESTS){ try{ unstash "perf_tile_gemm_log_gfx942" @@ -753,7 +776,7 @@ def process_results(Map conf=[:]){ } //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version -CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;ROCMVERSION=6.4;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true +CRON_SETTINGS = BRANCH_NAME == "develop" ? 
'''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;ROCMVERSION=6.4;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_TRANSPOSE_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true 0 21 * * * % ROCMVERSION=6.4;hipTensor_test=true;RUN_CODEGEN_TESTS=true;BUILD_GFX908=true 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true @@ -833,6 +856,10 @@ pipeline { name: "RUN_CK_TILE_FMHA_TESTS", defaultValue: false, description: "Run the ck_tile FMHA tests (default: OFF)") + booleanParam( + name: "RUN_CK_TILE_TRANSPOSE_TESTS", + defaultValue: false, + description: "Run the ck_tile Transpose tests (default: OFF)") booleanParam( name: "RUN_CK_TILE_GEMM_TESTS", defaultValue: false, @@ -1032,6 +1059,50 @@ pipeline { } } } + stage("Run CK_TILE_TRANSPOSE Tests") + { + parallel + { + stage("Run CK_TILE_TRANSPOSE Tests on gfx90a") + { + when { + beforeAgent true + expression { params.RUN_CK_TILE_TRANSPOSE_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx90a") } + environment{ + setup_args = "NO_CK_BUILD" + execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \ + make -j64 tile_example_batched_transpose && \ + cd ../ && + example/ck_tile/35_batched_transpose/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """ + } + steps{ + buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + cleanWs() + } + } + stage("Run CK_TILE_TRANSPOSE Tests on gfx942") + { + when { + beforeAgent true + expression { params.RUN_CK_TILE_TRANSPOSE_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx942") } + environment{ + setup_args = "NO_CK_BUILD" + execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \ + make -j64 tile_example_batched_transpose && \ + cd ../ && + 
example/ck_tile/35_batched_transpose/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """ + } + steps{ + buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + cleanWs() + } + } + } + } stage("Run CK_TILE_GEMM Tests") { parallel diff --git a/example/ck_tile/35_batched_transpose/README.md b/example/ck_tile/35_batched_transpose/README.md index d0583e7529..38bb2b32e4 100644 --- a/example/ck_tile/35_batched_transpose/README.md +++ b/example/ck_tile/35_batched_transpose/README.md @@ -24,4 +24,6 @@ args: -layout_out output tensor data layout - NHWC by default -seed seed to be used, -1 means random every time (default:-1) -k_name t to 1 will print kernel name (default:0) + -warmup warmup iterations to run this kernel (default:50) + -repeat number of iterations to run this kernel (default:100) ``` \ No newline at end of file diff --git a/example/ck_tile/35_batched_transpose/batched_transpose_api.cpp b/example/ck_tile/35_batched_transpose/batched_transpose_api.cpp index 77d768fe3f..1eb0445c84 100644 --- a/example/ck_tile/35_batched_transpose/batched_transpose_api.cpp +++ b/example/ck_tile/35_batched_transpose/batched_transpose_api.cpp @@ -1,7 +1,6 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include "batched_transpose_example.hpp" -#include template + ck_tile::index_t thread_y, + bool kPadM, + bool kPadN> float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_config& s) { - uint32_t dim_block_h = (a.height + block_y - 1) / block_y; - uint32_t dim_block_w = (a.width + block_x - 1) / block_x; - uint32_t dim_stride = a.height * a.width; + uint32_t dim_stride = a.height * a.width; a.dim_stride = dim_stride; - a.dim_block_h = dim_block_h; - a.dim_block_w = dim_block_w; + a.dim_block_h = block_y; + a.dim_block_w = block_x; using block_tile = ck_tile::sequence; using warp_tile = ck_tile::sequence; using thread_tile = ck_tile::sequence; using ts_problem = - ck_tile::BatchedTransposeProblem; + ck_tile::BatchedTransposeProblem; using ts_pipeline = ck_tile::BatchedTransposePipeline; using kernel = ck_tile::BatchedTransposeKernel; @@ -35,25 +34,40 @@ float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_con const dim3 grids = kernel::GridSize(a); constexpr dim3 blocks = kernel::BlockSize(); + printf("Grid: %u %u %u\n", grids.x, grids.y, grids.z); + printf("Block: %u %u %u\n", blocks.x, blocks.y, blocks.z); + printf("kargs: kargs.batch %d kargs.height %d kargs.width %d kargs.dim_strid %d\n", + kargs.batch, + kargs.height, + kargs.width, + kargs.dim_stride); + + printf("Launching Kernel...\n"); + float ave_time = ck_tile::launch_kernel( s, ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs)); + printf("Kernel finished...\n"); + return ave_time; } // Param Comb: type_size, block_x & y, warp_x & y, thread_x & y -#define FOREACH_TRANSPOSE_PARAM(F) \ - F(fp16, ck_tile::fp16_t, 16, 16, 8, 8, 1, 1) \ - F(bf16, ck_tile::bf16_t, 16, 16, 8, 8, 1, 1) \ - F(fp32, ck_tile::fp32_t, 16, 16, 8, 8, 1, 1) \ - F(int8, ck_tile::int8_t, 16, 16, 8, 8, 1, 1) +#define FOREACH_TRANSPOSE_PARAM(F) \ + F(fp8, ck_tile::fp8_t, 64, 64, 64, 64, 8, 8, true, true) \ + F(fp8, ck_tile::fp8_t, 64, 64, 64, 64, 8, 8, false, false) \ + F(fp16, 
ck_tile::fp16_t, 64, 64, 64, 64, 8, 8, true, true) \ + F(fp16, ck_tile::fp16_t, 64, 64, 64, 64, 8, 8, false, false) \ + F(bf16, ck_tile::bf16_t, 64, 64, 64, 64, 8, 8, true, true) \ + F(bf16, ck_tile::bf16_t, 64, 64, 64, 64, 8, 8, false, false) // Macro that defines one static function per line -#define GEN_TRANSPOSE_FN(SHORT_NAME, REAL_TYPE, BX, BY, WX, WY, TX, TY) \ - static float transpose_fn_##SHORT_NAME##_##BX##_##BY##_##WX##_##WY##_##TX##_##TY( \ - batched_transpose_kargs& a, ck_tile::stream_config& s) \ - { \ - return batched_transpose_dispatch(a, s); \ +#define GEN_TRANSPOSE_FN(SHORT_NAME, REAL_TYPE, BX, BY, WX, WY, TX, TY, PADM, PADN) \ + static float \ + transpose_fn_##SHORT_NAME##_##BX##_##BY##_##WX##_##WY##_##TX##_##TY##_##PADM##_##PADN( \ + batched_transpose_kargs& a, ck_tile::stream_config& s) \ + { \ + return batched_transpose_dispatch(a, s); \ } FOREACH_TRANSPOSE_PARAM(GEN_TRANSPOSE_FN) @@ -62,21 +76,38 @@ float batched_transpose(batched_transpose_trait t, batched_transpose_kargs a, ck_tile::stream_config s) { - if(t.type == "fp16") + if(t.type == "fp8") { - return transpose_fn_fp16_16_16_8_8_1_1(a, s); + if(a.height % 64 == 0 && a.width % 64 == 0) + { + return transpose_fn_fp8_64_64_64_64_8_8_false_false(a, s); + } + else + { + return transpose_fn_fp8_64_64_64_64_8_8_true_true(a, s); + } + } + else if(t.type == "fp16") + { + if(a.height % 64 == 0 && a.width % 64 == 0) + { + return transpose_fn_fp16_64_64_64_64_8_8_false_false(a, s); + } + else + { + return transpose_fn_fp16_64_64_64_64_8_8_true_true(a, s); + } } else if(t.type == "bf16") { - return transpose_fn_bf16_16_16_8_8_1_1(a, s); - } - else if(t.type == "fp32") - { - return transpose_fn_fp32_16_16_8_8_1_1(a, s); - } - else if(t.type == "int8") - { - return transpose_fn_int8_16_16_8_8_1_1(a, s); + if(a.height % 64 == 0 && a.width % 64 == 0) + { + return transpose_fn_bf16_64_64_64_64_8_8_false_false(a, s); + } + else + { + return transpose_fn_bf16_64_64_64_64_8_8_true_true(a, s); + } } return 
-1; } diff --git a/example/ck_tile/35_batched_transpose/batched_transpose_example.cpp b/example/ck_tile/35_batched_transpose/batched_transpose_example.cpp index 48fc2859bf..33b6f0eacf 100644 --- a/example/ck_tile/35_batched_transpose/batched_transpose_example.cpp +++ b/example/ck_tile/35_batched_transpose/batched_transpose_example.cpp @@ -21,13 +21,13 @@ void dump_host_tensor_4d(const ck_tile::HostTensor& x) std::cout << "["; for(size_t i = 0; i < len[0]; i++) { - std::cout << i << ": ["; + std::cout << "Batch " << i << ":" << std::endl; for(size_t j = 0; j < len[1]; j++) { - std::cout << j << ": ["; + std::cout << " Channel " << j << ":" << std::endl; for(size_t k = 0; k < len[2]; k++) { - std::cout << k << ": ["; + std::cout << " Row " << k << ": "; for(size_t v = 0; v < len[3]; v++) { if constexpr(std::is_same_v) @@ -41,15 +41,15 @@ void dump_host_tensor_4d(const ck_tile::HostTensor& x) } else { - std::cout << x(std::vector{i, j, k, v}) << " "; + std::cout << static_cast(x(std::vector{i, j, k, v})) + << " "; } } - std::cout << "]" << std::endl; + std::cout << std::endl; } - std::cout << "]" << std::endl; } - std::cout << std::endl; } + std::cout << "]" << std::endl; std::cout << "--------------------" << std::endl; } #endif @@ -93,12 +93,14 @@ auto create_args(int argc, char* argv[]) ck_tile::ArgParser arg_parser; arg_parser.insert("v", "1", "whether do CPU validation or not") .insert("pr", "fp16", "input data type. fp16/fp32 (representing 8/16/32 bit data)") - .insert("N", "2", "input batch size. ") - .insert("C", "16", "input channel size.") - .insert("H", "1", "input height size.") - .insert("W", "16", "input width size. ") + .insert("N", "1", "input batch size. ") + .insert("C", "64", "input channel size.") + .insert("H", "18", "input height size.") + .insert("W", "64", "input width size. 
") .insert("layout_in", "NCHW", "input tensor data layout - NCHW by default") .insert("layout_out", "NHWC", "output tensor data layout - NHWC by default ") + .insert("warmup", "50", "number of iterations before benchmark the kernel") + .insert("repeat", "100", "number of iterations to benchmark the kernel") .insert("seed", "-1", "seed to be used, -1 means random every time") .insert("kname", "0", "t to 1 will print kernel name"); @@ -115,6 +117,8 @@ bool run_batched_transpose(ck_tile::ArgParser args) int C = args.get_int("C"); int H = args.get_int("H"); int W = args.get_int("W"); + int n_warmup = args.get_int("warmup"); + int n_repeat = args.get_int("repeat"); std::string layout_in = args.get_str("layout_in"); std::string layout_out = args.get_str("layout_out"); int seed = args.get_int("seed"); @@ -177,7 +181,7 @@ bool run_batched_transpose(ck_tile::ArgParser args) return a_; }(); - ck_tile::stream_config sc{nullptr, true}; + ck_tile::stream_config sc{nullptr, true, n_warmup, n_repeat}; auto ms = batched_transpose(trait, karg, sc); @@ -202,7 +206,8 @@ bool run_batched_transpose(ck_tile::ArgParser args) layout_in.c_str(), ms); if(ms < 0) - printf("not supported\n"); + printf("------------------------------------not " + "supported-------------------------------------\n"); fflush(stdout); if(ms < 0) @@ -227,7 +232,9 @@ bool run_batched_transpose(ck_tile::ArgParser args) rtn &= ck_tile::check_err( y_host, y_ref, std::string("y Error: Incorrect results!"), rtol, atol); } - printf("valid:%s\n", rtn ? "y" : "n"); + printf("-----------------------------------------------------------------------valid:%s--------" + "--------------------------------------------------------------------\n", + rtn ? 
"y" : "n"); fflush(stdout); return rtn; } @@ -240,9 +247,9 @@ int main(int argc, char** argv) std::string prec = args.get_str("pr"); bool r = true; - if(prec.compare("fp32") == 0) + if(prec.compare("fp8") == 0) { - r &= run_batched_transpose(args); + r &= run_batched_transpose(args); } else if(prec.compare("fp16") == 0) { @@ -252,10 +259,6 @@ int main(int argc, char** argv) { r &= run_batched_transpose(args); } - else if(prec.compare("int8") == 0) - { - r &= run_batched_transpose(args); - } return r ? 0 : -1; } diff --git a/example/ck_tile/35_batched_transpose/script/perf_test.sh b/example/ck_tile/35_batched_transpose/script/perf_test.sh new file mode 100755 index 0000000000..7ecfefc580 --- /dev/null +++ b/example/ck_tile/35_batched_transpose/script/perf_test.sh @@ -0,0 +1,11 @@ +#!/bin/sh + +EXE=./build/bin/tile_example_batched_transpose + +for pr in "fp8" "fp16" "bf16"; do +$EXE -pr=$pr -N=1 -C=64 -H=1 -W=64 -layout_in='NCHW' -layout_out='NHWC' +$EXE -pr=$pr -N=1 -C=1024 -H=1 -W=1024 -layout_in='NCHW' -layout_out='NHWC' +$EXE -pr=$pr -N=1 -C=1024 -H=1 -W=2048 -layout_in='NCHW' -layout_out='NHWC' +$EXE -pr=$pr -N=1 -C=4096 -H=1 -W=2048 -layout_in='NCHW' -layout_out='NHWC' + +done \ No newline at end of file diff --git a/example/ck_tile/35_batched_transpose/script/run_full_test.sh b/example/ck_tile/35_batched_transpose/script/run_full_test.sh new file mode 100755 index 0000000000..4d0c988912 --- /dev/null +++ b/example/ck_tile/35_batched_transpose/script/run_full_test.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# +# in order to run this script you'd first need to build the tile_example_batched_transpose executables in ../build/bin/ +# +# run the script as "./run_full_test.sh +# input arguments: +# environment tag : a string describing the specifics of your test environment +# branch name : name of the branch in git repo (git status | grep -e 'On branch') +# host name : $hostname +# gpu architecture: e.g., gfx90a, or gfx942, etc. 
+ +#get the command line arguments: +export env_type=$1 +echo 'Environment type: ' $env_type +export branch=$2 +echo 'Branch name: ' $branch +export host_name=$3 +echo 'Host name: ' $host_name +export GPU_arch=$4 +echo 'GPU_arch: ' $GPU_arch + +function print_log_header(){ + rm -f $1; + echo 'On branch ' $3 &> $1; + echo 'Node name: ' $4 >> $1; + #get GPU_arch and number of compute units from rocminfo + echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1; + rocminfo | grep "Compute Unit:" >> $1; + hipcc --version | grep -e 'HIP version' >> $1; + echo 'Environment type: ' $2 >> $1; + /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1; +} + +#run verification tests +example/ck_tile/35_batched_transpose/script/smoke_test.sh + +#run performance benchmarks + diff --git a/example/ck_tile/35_batched_transpose/script/smoke_test.sh b/example/ck_tile/35_batched_transpose/script/smoke_test.sh index fdfef2cea8..fdc01a2eb4 100755 --- a/example/ck_tile/35_batched_transpose/script/smoke_test.sh +++ b/example/ck_tile/35_batched_transpose/script/smoke_test.sh @@ -2,10 +2,26 @@ EXE=./build/bin/tile_example_batched_transpose -for pr in "fp32" "fp16" "int8" ; do +for pr in "fp8" "fp16" "bf16"; do $EXE -pr=$pr -N=1 -C=32 -H=1 -W=32 -layout_in='NCHW' -layout_out='NHWC' +$EXE -pr=$pr -N=1 -C=64 -H=1 -W=64 -layout_in='NCHW' -layout_out='NHWC' $EXE -pr=$pr -N=2 -C=12 -H=1 -W=32 -layout_in='NHWC' -layout_out='NCHW' $EXE -pr=$pr -N=3 -C=1334 -H=1 -W=37 -layout_in='NHWC' -layout_out='NCHW' $EXE -pr=$pr -N=4 -C=27 -H=1 -W=32 -layout_in='NCHW' -layout_out='NHWC' $EXE -pr=$pr -N=5 -C=1234 -H=1 -W=12 -layout_in='NCHW' -layout_out='NHWC' -done +$EXE -pr=$pr -N=1 -C=1 -H=1 -W=1 -layout_in='NCHW' -layout_out='NHWC' +$EXE -pr=$pr -N=1 -C=1 -H=1 -W=1 -layout_in='NHWC' -layout_out='NCHW' +$EXE -pr=$pr -N=128 -C=1024 -H=64 -W=64 -layout_in='NCHW' -layout_out='NHWC' +$EXE -pr=$pr -N=128 -C=1024 -H=64 -W=64 -layout_in='NHWC' -layout_out='NCHW' +$EXE -pr=$pr -N=16 -C=64 
-H=32 -W=128 -layout_in='NCHW' -layout_out='NHWC' +$EXE -pr=$pr -N=16 -C=64 -H=128 -W=32 -layout_in='NHWC' -layout_out='NCHW' +$EXE -pr=$pr -N=1 -C=2048 -H=1 -W=1 -layout_in='NCHW' -layout_out='NHWC' +$EXE -pr=$pr -N=1 -C=2048 -H=1 -W=1 -layout_in='NHWC' -layout_out='NCHW' +$EXE -pr=$pr -N=1 -C=1 -H=1024 -W=1024 -layout_in='NCHW' -layout_out='NHWC' +$EXE -pr=$pr -N=1 -C=1 -H=1024 -W=1024 -layout_in='NHWC' -layout_out='NCHW' +$EXE -pr=$pr -N=8 -C=16 -H=8 -W=16 -layout_in='NCHW' -layout_out='NHWC' +$EXE -pr=$pr -N=8 -C=16 -H=8 -W=16 -layout_in='NHWC' -layout_out='NCHW' +$EXE -pr=$pr -N=1 -C=64 -H=1 -W=1024 -layout_in='NCHW' -layout_out='NHWC' +$EXE -pr=$pr -N=1 -C=64 -H=1024 -W=1 -layout_in='NHWC' -layout_out='NCHW' + +done \ No newline at end of file diff --git a/include/ck_tile/core/tensor/tensor_view.hpp b/include/ck_tile/core/tensor/tensor_view.hpp index 32de227b52..29db5e1fca 100644 --- a/include/ck_tile/core/tensor/tensor_view.hpp +++ b/include/ck_tile/core/tensor/tensor_view.hpp @@ -384,22 +384,6 @@ struct tensor_view coord.get_offset() / PackedSize, linear_offset / PackedSize, is_valid_element, x); } - CK_TILE_HOST_DEVICE void print() const - { - printf("tensor_view{"); - - // buf_ - printf("buf_: "); - print(buf_); - printf(", "); - - // desc_ - printf("desc_: "); - print(desc_); - - printf("}"); - } - // member buffer_view buf_; TensorDesc desc_; @@ -494,6 +478,7 @@ template {}); constexpr auto scalars_per_access = TO_SEQUENCE(scalars_per_access_arr, NDimY); @@ -103,13 +108,19 @@ CK_TILE_DEVICE void transpose_tile2d_impl_in_thread(OutTensor& out_tensor, // loop over SFC static_for<0, num_access, 1>{}([&](auto iAccess) { // data index [y0, y1, ...] 
in the order of input tensor - constexpr auto idx_y = SFC_Y::get_index(iAccess); - - constexpr index_t in_offset = y_in_desc.calculate_offset(idx_y); - constexpr index_t out_offset = y_out_desc.calculate_offset(idx_y); - + constexpr auto idx_y_start = SFC_Y::get_index(iAccess); + constexpr auto idx_y_in = + generate_tuple([&](auto ii) { return idx_y_start[ii].value; }, number{}); + constexpr index_t in_offset = y_in_desc.calculate_offset(idx_y_in); + static_assert(in_offset % vec_length_in == 0); + constexpr auto idx_y_out_tmp = + generate_array([&](auto ii) { return idx_y_start[ii].value; }, number{}); + constexpr auto idx_y_out = + container_reorder_given_new2old(idx_y_out_tmp, y_dim_out_to_in); + constexpr index_t out_offset = y_out_desc.calculate_offset(idx_y_out); if constexpr(vec_length_in == 1) { + out_tensor.get_thread_buffer()[number{}] = in_tensor.get_thread_buffer()[number{}]; } diff --git a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp index 7e7dd03c6a..4c3aa2ba29 100644 --- a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp +++ b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp @@ -19,7 +19,6 @@ struct BatchedTransposeHostArgs index_t batch; index_t height; index_t width; - // index_t dim_blocks; index_t dim_stride; index_t dim_block_h; index_t dim_block_w; @@ -28,8 +27,10 @@ struct BatchedTransposeHostArgs template struct BatchedTransposeKernel { - using Pipeline = remove_cvref_t; - using Problem = remove_cvref_t; + + CK_TILE_DEVICE static index_t counter = 0; + using Pipeline = remove_cvref_t; + using Problem = remove_cvref_t; using Type = typename Problem::InputType; @@ -46,11 +47,11 @@ struct BatchedTransposeKernel using Kargs = BatchedTransposeKargs; using Hargs = BatchedTransposeHostArgs; - CK_TILE_HOST static constexpr auto GridSize(const Hargs& h) + CK_TILE_HOST static constexpr auto 
GridSize(const Hargs& host_args) { - size_t grid_size_x = (h.width + h.dim_block_w - 1) / h.dim_block_w; - size_t grid_size_y = (h.height + h.dim_block_h - 1) / h.dim_block_h; - size_t grid_size_z = h.batch; + size_t grid_size_x = (host_args.height + host_args.dim_block_h - 1) / host_args.dim_block_h; + size_t grid_size_y = (host_args.width + host_args.dim_block_w - 1) / host_args.dim_block_w; + size_t grid_size_z = host_args.batch; return dim3(grid_size_x, grid_size_y, grid_size_z); } @@ -70,58 +71,52 @@ struct BatchedTransposeKernel CK_TILE_DEVICE void operator()(Kargs kargs) const { + static constexpr ck_tile::index_t kMPerBlock = Problem::kMPerBlock; + static constexpr ck_tile::index_t kNPerBlock = Problem::kNPerBlock; + static constexpr bool kPadM = Problem::kPadM; + static constexpr bool kPadN = Problem::kPadN; + static constexpr ck_tile::index_t VectorSizeInput = Problem::VectorSizeInput; + static constexpr ck_tile::index_t VectorSizeOutput = Problem::VectorSizeOutput; - static constexpr ck_tile::index_t kMPerBlock = Problem::kMPerBlock; - static constexpr ck_tile::index_t kNPerBlock = Problem::kNPerBlock; - static constexpr bool kPadM = Problem::kPadM; - static constexpr bool kPadN = Problem::kPadN; + const auto iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock); + const auto iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kNPerBlock); + const auto iDim = blockIdx.z; - static constexpr ck_tile::index_t kMPerThread = Problem::kMPerThread; - static constexpr ck_tile::index_t kNPerThread = Problem::kNPerThread; - - static_assert(kMPerThread == 1 && kNPerThread == 1); - - const auto iDim = blockIdx.z; const auto x_m_n = [&]() { const auto x_dram_naive = make_naive_tensor_view( static_cast(kargs.p_input) + iDim * kargs.dim_stride, make_tuple(kargs.height, kargs.width), make_tuple(kargs.width, 1), - number{}, // TODO thread load value + number{}, number<1>{}); return pad_tensor_view(x_dram_naive, make_tuple(number{}, number{}), - sequence{}); + 
sequence{}); }(); - const auto iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock); - const auto iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kNPerBlock); - const auto y_n_m = [&]() { const auto y_dram_naive = make_naive_tensor_view( static_cast(kargs.p_output) + iDim * kargs.dim_stride, make_tuple(kargs.width, kargs.height), make_tuple(kargs.height, 1), - number{}, + number{}, number<1>{}); return pad_tensor_view(y_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); - auto x_block_window = - make_tile_window(x_m_n, - make_tuple(number{}, number{}), - {static_cast(iM * kMPerBlock), - static_cast(iN * kNPerBlock)}); + auto x_block_window = make_tile_window( + x_m_n, + make_tuple(number{}, number{}), + {static_cast(iM), static_cast(iN)}); - auto y_block_window = - make_tile_window(y_n_m, - make_tuple(number{}, number{}), - {static_cast(iN * kNPerBlock), - static_cast(iM * kMPerBlock)}); + auto y_block_window = make_tile_window( + y_n_m, + make_tuple(number{}, number{}), + {static_cast(iN), static_cast(iM)}); Pipeline{}(x_block_window, y_block_window); } diff --git a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_pipeline.hpp b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_pipeline.hpp index aa62333918..e815313c06 100644 --- a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_pipeline.hpp +++ b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_pipeline.hpp @@ -29,24 +29,18 @@ struct BatchedTransposePipeline { auto inp_win = make_tile_window(input_window, Policy::template MakeInputDistribution()); + + auto input_tile = load_tile(inp_win); + + auto output_tile = make_static_distributed_tensor( + Policy::template MakeOutputDistribution()); + + transpose_tile2d(output_tile, input_tile); + auto out_win = make_tile_window(out_window, Policy::template MakeOutputDistribution()); - auto x = load_tile(inp_win); // x->thread input_win->block - - auto y = 
make_static_distributed_tensor( - Policy::template MakeOutputDistribution()); - - constexpr auto span_2d_x = decltype(x)::get_distributed_spans(); - - sweep_tile_span(span_2d_x[number<0>{}], [&](auto idx0) { - sweep_tile_span(span_2d_x[number<1>{}], [&](auto idx1) { - constexpr auto i_j_idx = make_tuple(idx1, idx0); - y(i_j_idx) = x(i_j_idx); - }); - }); - - store_tile(out_win, y); + store_tile(out_win, output_tile); } }; } // namespace ck_tile diff --git a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp index 9953e8b8bf..dd9a6d79a8 100644 --- a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp +++ b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp @@ -14,31 +14,34 @@ struct BatchedTransposePolicy template CK_TILE_HOST_DEVICE static constexpr auto MakeInputDistribution() { - using S = Problem; - return make_static_tile_distribution( - tile_distribution_encoding< - sequence<>, - tuple, - sequence>, - tuple, sequence<1, 2>>, - tuple, sequence<1, 1>>, - sequence<1, 2>, - sequence<2, 2>>{}); + constexpr index_t BlockSize = Problem::kBlockSize; + constexpr index_t MPerBlock = Problem::kMPerBlock; + constexpr index_t NPerBlock = Problem::kNPerBlock; + constexpr index_t VecLoadSize = Problem::VectorSizeInput; + using TileEncodingPattern = + TileDistributionEncodingPattern2D; + return TileEncodingPattern::Make2DStaticTileDistribution(); } template CK_TILE_HOST_DEVICE static constexpr auto MakeOutputDistribution() { - using S = Problem; - return make_static_tile_distribution( - tile_distribution_encoding< - sequence<>, - tuple, - sequence>, - tuple, sequence<2, 1>>, - tuple, sequence<1, 1>>, - sequence<2, 1>, - sequence<2, 2>>{}); + constexpr index_t BlockSize = Problem::kBlockSize; + constexpr index_t MPerBlock = Problem::kMPerBlock; + constexpr index_t NPerBlock = Problem::kNPerBlock; + constexpr index_t VecLoadSize 
= Problem::VectorSizeOutput; + + using TileEncodingPattern = + TileDistributionEncodingPattern2D; + return TileEncodingPattern::MakeShuffled2DStaticTileDistribution(); } }; } // namespace ck_tile diff --git a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_problem.hpp b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_problem.hpp index af6b2d51aa..fd5ea004b6 100644 --- a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_problem.hpp +++ b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_problem.hpp @@ -4,7 +4,6 @@ #pragma once #include "ck_tile/core.hpp" -#include #include #define VectorLoadSize 16 @@ -12,11 +11,11 @@ namespace ck_tile { template + typename BlockTile, // Sequence<... + typename WarpTile, // Sequence<... + typename ThreadTile, + bool kPadM_ = false, + bool kPadN_ = false> // Sequence<... struct BatchedTransposeProblem { using InputType = remove_cvref_t; @@ -42,7 +41,7 @@ struct BatchedTransposeProblem static constexpr bool kPadM = kPadM_; static constexpr bool kPadN = kPadN_; - static constexpr index_t AlignmentM = kPadM ? VectorLoadSize / sizeof(InputType) : 1; // TODO - static constexpr index_t AlignmentN = kPadN ? VectorLoadSize / sizeof(InputType) : 1; + static constexpr index_t VectorSizeInput = kPadM ? 1 : VectorLoadSize / sizeof(InputType); + static constexpr index_t VectorSizeOutput = kPadN ? 
1 : VectorLoadSize / sizeof(InputType); }; } // namespace ck_tile From b49f7de81f35610c93129eadd2103e78bd0257d4 Mon Sep 17 00:00:00 2001 From: Thomas Ning Date: Mon, 12 May 2025 09:52:58 -0700 Subject: [PATCH 114/443] Improve the general performance of the Preshuffled GEMM V3 & delete the unnecessary instances (#2166) * make the work compiled * Solved the example code, but still have the profiler error * Finished the feature * Clang format and update the CHANGELOG * solve the preshuffle v1 & v2 problem * Comment Addressed * Comment Addressed --- CHANGELOG.md | 3 + ..._multiply_multiply_xdl_fp8_bpreshuffle.cpp | 9 +- ...e_gemm_pipeline_xdlops_b_preshuffle_v1.hpp | 53 +- ...e_gemm_pipeline_xdlops_b_preshuffle_v2.hpp | 68 +- ...e_gemm_pipeline_xdlops_b_preshuffle_v3.hpp | 708 ++++++++---------- .../blockwise_gemm_pipeline_xdlops_base.hpp | 5 + ...m_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp | 16 +- include/ck/utility/blkgemmpipe_scheduler.hpp | 20 + .../gpu/gemm_multiply_multiply_wp.hpp | 389 ---------- .../gemm_multiply_multiply_wp/CMakeLists.txt | 48 -- ..._multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp | 44 +- 11 files changed, 445 insertions(+), 918 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 60fe2df99d..4be173dd85 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,8 +19,11 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj ### Optimized + +* Optimize the gemm multiply multiply preshuffle & lds bypass with Pack of KGroup and better instruction layout. 
(#2166) * Added Vectorize Transpose optimization for CK Tile (#2131) + ### Fixes None diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp index e4e6a4f1a7..9f758d5fc5 100644 --- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp +++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp @@ -9,7 +9,6 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_b_preshuffle.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" @@ -142,12 +141,12 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShu AElementOp, BElementOp, CDEElementOp, GemmSpec, 256, 128, 128, 128, 16, 16, - 32, 32, - 2, 2, + 16, 16, + 8, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, - 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, - ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, FP8>; + 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, + ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, FP8>; // clang-format on int main(int argc, char* argv[]) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp index d751543175..1d27a74bd7 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp @@ -122,6 +122,7 @@ struct 
BlockwiseGemmXdlops_pipeline_bpreshuffle_v1{}); constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{}); constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{}); - constexpr index_t K2 = KPack; + constexpr index_t K2 = KPack / KGroup; constexpr index_t K1 = 64 / NPerXDL; - constexpr index_t K0 = KRepeat; + constexpr index_t K0 = KRepeat * KGroup; return transform_tensor_descriptor( TileDesc_M0_M1_M2_K{}, @@ -280,12 +281,14 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1{}([&](auto m0) { static_for<0, KRepeat, 1>{}([&](auto k0) { - a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, - make_tuple(m0, I0, I0, k0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(m0, I0, I0, k0, I0, I0), - a_thread_buf); + static_for<0, KGroup, 1>{}([&](auto kg0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(m0, I0, I0, Number{}, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, I0, k0, I0, Number{}), + a_thread_buf); + }); }); }); @@ -346,14 +349,18 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1{}([&](auto m0) { static_for<0, KRepeat, 1>{}([&](auto k0) { - a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, - make_tuple(m0, I0, I0, k0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(m0, I0, I0, k0, I0, I0), - a_thread_buf); + static_for<0, KGroup, 1>{}([&](auto kg0) { + a_thread_copy_.Run( + a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(m0, I0, I0, Number{}, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, I0, k0, I0, Number{}), + a_thread_buf); + }); }); }); @@ -409,14 +416,18 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1{}([&](auto m0) { static_for<0, KRepeat, 1>{}([&](auto k0) { - a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, - make_tuple(m0, I0, I0, k0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(m0, I0, I0, k0, I0, I0), - a_thread_buf); + static_for<0, KGroup, 1>{}([&](auto kg0) { + a_thread_copy_.Run( + a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(m0, I0, I0, 
Number{}, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, I0, k0, I0, Number{}), + a_thread_buf); + }); }); }); @@ -495,7 +506,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1, + Sequence<1, 1, 1, 1, 1, KPack / KGroup>, Sequence<0, 1, 2, 3, 4, 5>, 5, A_K1, diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp index 4c019a41a4..7bbaaca5b6 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp @@ -122,6 +122,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v2{}); constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{}); constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{}); - constexpr index_t K2 = KPack; + constexpr index_t K2 = KPack / KGroup; constexpr index_t K1 = 64 / NPerXDL; - constexpr index_t K0 = KRepeat; + constexpr index_t K0 = KRepeat * KGroup; return transform_tensor_descriptor( TileDesc_M0_M1_M2_K{}, @@ -281,12 +282,14 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v2{}([&](auto m0) { static_for<0, KRepeat, 1>{}([&](auto k0) { - a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, - make_tuple(m0, I0, I0, k0, I0, I0), - a_block_buf.At(I0), - a_thread_desc_, - make_tuple(m0, I0, I0, k0, I0, I0), - a_thread_bufs(I0)); + static_for<0, KGroup, 1>{}([&](auto kg0) { + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(m0, I0, I0, Number{}, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, I0, k0, I0, Number{}), + a_thread_buf); + }); }); }); @@ -318,14 +321,18 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v2{}([&](auto m0) { static_for<0, KRepeat, 1>{}([&](auto k0) { - a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, - make_tuple(m0, I0, I0, k0, I0, I0), - a_block_buf.At(local_read_buf), - 
a_thread_desc_, - make_tuple(m0, I0, I0, k0, I0, I0), - a_thread_bufs(local_read_buf)); + static_for<0, KGroup, 1>{}([&](auto kg0) { + a_thread_copy_.Run( + a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(m0, I0, I0, Number{}, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, I0, k0, I0, Number{}), + a_thread_buf); + }); }); }); @@ -389,14 +396,18 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v2{}([&](auto m0) { static_for<0, KRepeat, 1>{}([&](auto k0) { - a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, - make_tuple(m0, I0, I0, k0, I0, I0), - a_block_buf.At(local_read_reg), - a_thread_desc_, - make_tuple(m0, I0, I0, k0, I0, I0), - a_thread_bufs(local_read_reg)); + static_for<0, KGroup, 1>{}([&](auto kg0) { + a_thread_copy_.Run( + a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(m0, I0, I0, Number{}, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, I0, k0, I0, Number{}), + a_thread_buf); + }); }); }); @@ -445,12 +456,15 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v2{}([&](auto m0) { static_for<0, KRepeat, 1>{}([&](auto k0) { - a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, - make_tuple(m0, I0, I0, k0, I0, I0), - a_block_buf.At(local_read_reg), - a_thread_desc_, - make_tuple(m0, I0, I0, k0, I0, I0), - a_thread_bufs(local_read_reg)); + static_for<0, KGroup, 1>{}([&](auto kg0) { + a_thread_copy_.Run( + a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(m0, I0, I0, Number{}, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, I0, k0, I0, Number{}), + a_thread_buf); + }); }); }); @@ -539,7 +553,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v2, + Sequence<1, 1, 1, 1, 1, KPack / KGroup>, Sequence<0, 1, 2, 3, 4, 5>, 5, A_K1, diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp index 6d115e7620..6f3a7e6357 100644 --- 
a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp @@ -5,6 +5,16 @@ #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp" +#define DS_READ_A_PREFETCH_STAGES 2 + +template +constexpr auto compute_stage_loads(T total_loads, T stages) +{ + return std::make_pair((total_loads + stages - 1) / stages, // ceil + total_loads / stages // floor + ); +} + namespace ck { // Compute optimized pipeline @@ -123,6 +133,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3{}); constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{}); constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{}); - constexpr index_t K2 = KPack; + constexpr index_t K2 = KPack / KGroup; constexpr index_t K1 = 64 / NPerXDL; - constexpr index_t K0 = KRepeat; + constexpr index_t K0 = KRepeat * KGroup; return transform_tensor_descriptor( TileDesc_M0_M1_M2_K{}, @@ -184,298 +191,132 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3 - __device__ static constexpr auto HotLoopScheduler(Stage stage) + __device__ static constexpr auto HotLoopScheduler() { - constexpr auto num_ds_read_inst_a = HotLoopInstList::A_LDS_Read_Inst_Num; - constexpr auto num_ds_write_inst_a = HotLoopInstList::A_LDS_Write_Inst_Num; + constexpr auto num_ds_read_inst_a = + HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 + ? 
HotLoopInstList::A_LDS_Read_Inst_Num + : HotLoopInstList::A_LDS_Read_Inst_Num / 2; + + constexpr auto num_ds_write_inst_a = HotLoopInstList::A_LDS_Write_Inst_Num; + constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num; - constexpr auto num_buffer_load_inst_b = MWaves * HotLoopInstList::B_Buffer_Load_Inst_Num; + constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num; - constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num; + static_assert(num_buffer_load_inst_a == num_ds_write_inst_a); - constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat; - constexpr auto staged_num_mfma = num_mfma / MRepeat; + constexpr auto num_mfma_inst = HotLoopInstList::C_MFMA_Inst_Num; + constexpr auto mfma_cycle = HotLoopInstList::C_MFMA_Inst_Cycle; - constexpr auto staged_num_mfma_per_ds_read_a = staged_num_mfma / staged_num_ds_read_inst_a; + constexpr auto ds_read_a_issue_cycle = + HotLoopInstList::A_LDS_Read_Width * sizeof(ADataType) == 16 ? 
8 : 4; + constexpr auto ds_read_a_mfma_rate = + math::integer_divide_ceil(mfma_cycle - 4, 2 * ds_read_a_issue_cycle); - if constexpr(stage.value == 0) - { - constexpr auto staged_num_buffer_load_b_per_ds_read_a = - num_buffer_load_inst_b / staged_num_ds_read_inst_a; - constexpr auto staged_num_mfma_per_buffer_load_b = - staged_num_mfma / num_buffer_load_inst_b; - // B global - static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) { - ignore = i_inst; + constexpr auto num_total_stages = MRepeat; - static_for<0, staged_num_buffer_load_b_per_ds_read_a - 1, 1>{}([&](auto ibuf_inst) { - ignore = ibuf_inst; + // Group num_mfma_perstage num_ds_read_a_perstage + // since we want to reuse a local register buffer + constexpr auto num_mfma_perstage = num_mfma_inst / MRepeat; + constexpr auto num_ds_read_a_perstage = num_ds_read_inst_a / MRepeat; + + constexpr auto num_ds_read_a_mfma_perstage = + math::integer_divide_ceil(num_ds_read_a_perstage, ds_read_a_mfma_rate); + + constexpr auto total_buffer_loads = num_buffer_load_inst_a + num_buffer_load_inst_b; + constexpr auto stages_available = MRepeat - DS_READ_A_PREFETCH_STAGES; + + constexpr auto stage_loads = compute_stage_loads(total_buffer_loads, stages_available); + + constexpr auto buffer_load_perstage_more = stage_loads.first; + constexpr auto buffer_load_perstage_less = stage_loads.second; + + constexpr auto buffer_load_stages_more = total_buffer_loads % stages_available; + + constexpr auto buffer_b_heavy_loads = buffer_load_perstage_more * buffer_load_stages_more; + constexpr auto buffer_b_remaining = + num_buffer_load_inst_b - buffer_load_perstage_more * buffer_load_stages_more; + + constexpr auto buffer_load_b_stages = + buffer_b_heavy_loads > num_buffer_load_inst_b + ? 
num_buffer_load_inst_b / buffer_load_perstage_more + : (buffer_load_stages_more + buffer_b_remaining / buffer_load_perstage_less); + + constexpr auto buffer_load_a_stages = + num_total_stages - DS_READ_A_PREFETCH_STAGES - buffer_load_b_stages; + + static_assert(buffer_load_a_stages > 0, + "The buffer load a stages should always have a value over 0."); + + constexpr auto buffer_load_issue_point_interval_more = + math::integer_divide_ceil(num_mfma_perstage, buffer_load_perstage_more); + constexpr auto buffer_load_issue_point_interval_less = + buffer_load_perstage_less == 0 + ? INT32_MAX + : math::integer_divide_ceil(num_mfma_perstage, buffer_load_perstage_less); + constexpr auto buffer_load_issue_point_a = num_mfma_perstage >= 3 ? 1 : 0; + + // B global read + static_for<0, buffer_load_b_stages, 1>{}([&](auto i) { + static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) { + __builtin_amdgcn_sched_group_barrier(SCHED_GROUP_MFMA, 1, 0); + + if constexpr(((i < buffer_load_stages_more) && + (imfma % buffer_load_issue_point_interval_more == 0)) || + ((i >= buffer_load_stages_more) && + (imfma % buffer_load_issue_point_interval_less == 0))) + { + __builtin_amdgcn_sched_group_barrier(SCHED_GROUP_VMEM, 1, 0); + } + + if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage)) + { __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_buffer_load_b, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read - }); - - __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_buffer_load_b - 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read - }); - - __builtin_amdgcn_sched_barrier(0); - } - else if constexpr(stage.value == 1) - { - constexpr auto staged_num_mfma_per_ds_write_a = - math::integer_divide_ceil(staged_num_mfma, num_ds_write_inst_a); - - constexpr auto 
stage_more_mfma = - staged_num_mfma - (staged_num_mfma_per_ds_write_a - 1) * num_ds_write_inst_a; - - // A local write - static_for<0, num_ds_write_inst_a, 1>{}([&](auto i_inst) { - if constexpr(i_inst.value < stage_more_mfma) - { - if(i_inst.value < staged_num_ds_read_inst_a) - { - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write - __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read - } - else - { - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_ds_write_a, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write - } - } - else - { - if(i_inst.value < staged_num_ds_read_inst_a) - { - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_ds_write_a - 2, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write - __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read - } - else - { - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write - } + SCHED_GROUP_LDS_READ, ds_read_a_mfma_rate, 0); } }); - - __builtin_amdgcn_sched_barrier(0); - } - else if constexpr(stage.value == 2) - { - constexpr auto staged_num_mfma_per_buffer_load_a = - math::integer_divide_ceil(staged_num_mfma, num_buffer_load_inst_a); - - constexpr auto stage_more_mfma = - staged_num_mfma - (staged_num_mfma_per_buffer_load_a - 1) * num_buffer_load_inst_a; - - // A global - static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i_inst) { - if constexpr(i_inst.value < stage_more_mfma) - { - if(i_inst.value < staged_num_ds_read_inst_a) - { - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_buffer_load_a - 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x020, 1, 
0); // VMEM read - __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read - } - else - { - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_buffer_load_a, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read - } - } - else - { - if(i_inst.value < staged_num_ds_read_inst_a) - { - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_buffer_load_a - 2, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read - __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read - } - else - { - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_buffer_load_a - 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read - } - } - }); - - __builtin_amdgcn_sched_barrier(0); - } - else - { - // A local Read - static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) { - ignore = i_inst; - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_ds_read_a, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read - }); - - __builtin_amdgcn_sched_barrier(0); - } - } - - template - __device__ static constexpr auto EpilogueScheduler_1(Stage stage) - { - constexpr auto num_ds_read_inst_a = HotLoopInstList::A_LDS_Read_Inst_Num; - constexpr auto num_ds_write_inst_a = HotLoopInstList::A_LDS_Write_Inst_Num; - constexpr auto num_buffer_load_inst_b = MWaves * HotLoopInstList::B_Buffer_Load_Inst_Num; - - constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num; - - constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat; - constexpr auto staged_num_mfma = num_mfma / MRepeat; - - constexpr auto staged_num_mfma_per_ds_read_a = staged_num_mfma / staged_num_ds_read_inst_a; - - if constexpr(stage.value == 0) - { - constexpr auto staged_num_buffer_load_b_per_ds_read_a = - num_buffer_load_inst_b / 
staged_num_ds_read_inst_a; - constexpr auto staged_num_mfma_per_buffer_load_b = - staged_num_mfma / num_buffer_load_inst_b; - // B global - static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) { - ignore = i_inst; - - static_for<0, staged_num_buffer_load_b_per_ds_read_a, 1>{}([&](auto ibuf_inst) { - ignore = ibuf_inst; - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_buffer_load_b, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read - }); - - __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_buffer_load_b - 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read - }); - - __builtin_amdgcn_sched_barrier(0); - } - else if constexpr(stage.value == 1) - { -#if 0 - constexpr auto staged_num_ds_write_a_per_ds_read_a = - num_ds_write_inst_a / staged_num_ds_read_inst_a; - constexpr auto staged_num_mfma_per_ds_write_a = staged_num_mfma / num_ds_write_inst_a; - // A local write - static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) { - ignore = i_inst; - - static_for<0, staged_num_ds_write_a_per_ds_read_a, 1>{}([&](auto idswrite_inst) { - ignore = idswrite_inst; - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write - }); - - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_ds_write_a_per_ds_read_a, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read - }); -#elif 1 - constexpr auto staged_num_mfma_per_ds_write_a = - math::integer_divide_ceil(staged_num_mfma, num_ds_write_inst_a); - - constexpr auto stage_more_mfma = - staged_num_mfma - (staged_num_mfma_per_ds_write_a - 1) * num_ds_write_inst_a; - - // A local write - static_for<0, num_ds_write_inst_a, 1>{}([&](auto i_inst) { - if constexpr(i_inst.value < 
stage_more_mfma) - { - if(i_inst.value < staged_num_ds_read_inst_a) - { - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write - __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read - } - else - { - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_ds_write_a, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write - } - } - else - { - if(i_inst.value < staged_num_ds_read_inst_a) - { - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_ds_write_a - 2, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write - __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read - } - else - { - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write - } - } - }); -#endif - __builtin_amdgcn_sched_barrier(0); - } - else - { - // A local Read - static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) { - ignore = i_inst; - __builtin_amdgcn_sched_group_barrier( - 0x008, staged_num_mfma_per_ds_read_a, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read - }); - - __builtin_amdgcn_sched_barrier(0); - } - } - - __device__ static constexpr auto EpilogueScheduler_2() - { - constexpr auto num_ds_read_inst_a = HotLoopInstList::A_LDS_Read_Inst_Num; - - constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num; - - constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat; - constexpr auto staged_num_mfma = num_mfma / MRepeat; - - constexpr auto staged_num_mfma_per_ds_read_a = staged_num_mfma / staged_num_ds_read_inst_a; - - // A local Read - static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) { - ignore = i_inst; - 
__builtin_amdgcn_sched_group_barrier(0x008, staged_num_mfma_per_ds_read_a, 0); // MFMA - __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read }); - __builtin_amdgcn_sched_barrier(0); + // A global read + A local write + static_for<0, buffer_load_a_stages, 1>{}([&](auto i) { + static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) { + __builtin_amdgcn_sched_group_barrier(SCHED_GROUP_MFMA, 1, 0); + if constexpr((((i + buffer_load_b_stages) < buffer_load_stages_more) && + (imfma % buffer_load_issue_point_interval_more == 0)) || + (((i + buffer_load_b_stages) >= buffer_load_stages_more) && + (imfma % buffer_load_issue_point_interval_less == 0))) + { + __builtin_amdgcn_sched_group_barrier(SCHED_GROUP_LDS_WRITE, 1, 0); + } + if constexpr((((i + buffer_load_b_stages) < buffer_load_stages_more) && + (imfma % buffer_load_issue_point_interval_more == + buffer_load_issue_point_a)) || + (((i + buffer_load_b_stages) >= buffer_load_stages_more) && + (imfma % buffer_load_issue_point_interval_less == + buffer_load_issue_point_a))) + { + __builtin_amdgcn_sched_group_barrier(SCHED_GROUP_VMEM, 1, 0); + } + if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage)) + { + __builtin_amdgcn_sched_group_barrier( + SCHED_GROUP_LDS_READ, ds_read_a_mfma_rate, 0); + } + }); + }); + + // lds synchronization, prefetch next loop local A + static_for<0, DS_READ_A_PREFETCH_STAGES, 1>{}([&](auto i) { + ignore = i; + static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) { + __builtin_amdgcn_sched_group_barrier(SCHED_GROUP_MFMA, 1, 0); + if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage)) + { + __builtin_amdgcn_sched_group_barrier( + SCHED_GROUP_LDS_READ, ds_read_a_mfma_rate, 0); + } + }); + }); } template {}([&](auto k0) { - a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, - make_tuple(I0, I0, I0, k0, I0, I0), - a_block_buf.At(I0), - a_thread_desc_, - make_tuple(I0, I0, I0, k0, I0, I0), - a_thread_buf); + static_for<0, DS_READ_A_PREFETCH_STAGES, 
1>{}([&](auto m0) { + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, KGroup, 1>{}([&](auto kg0) { + // K = k0 × KGroup × k1 = k0 × kg0 × A_K1 + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(m0, I0, I0, Number{}, I0, I0), + a_block_buf.At(I0), + a_thread_desc_, + make_tuple(m0, I0, I0, k0, I0, Number{}), + a_thread_buf); + }); + }); }); // Initialize C @@ -558,26 +404,18 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3{}([&](auto m0) { - if constexpr(m0.value == 0) - { - b_blockwise_copy.Run(b_grid_desc, - b_grid_buf, - b_block_desc_n0_n1_k0_k1, - b_block_origin_idx, - b_thread_bufs(local_read_buf)); - b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); - } - else if constexpr(m0.value == 1) - { - a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(local_read_buf)); - } - else if constexpr(m0.value == 2) - { - a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); - a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); - } + b_blockwise_copy.Run(b_grid_desc, + b_grid_buf, + b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs(local_read_buf)); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(local_read_buf)); + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + + static_for<0, MRepeat, 1>{}([&](auto m0) { static_for<0, KRepeat, 1>{}([&](auto k0) { static_for<0, NRepeat, 1>{}([&](auto n0) { vector_type a_thread_vec; @@ -613,49 +451,88 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3{}([&](auto k0) { - a_thread_copy_.Run( - a_block_desc_m0_m1_m2_k0_k1_k2, - make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0), - a_block_buf.At(local_read_buf), - a_thread_desc_, - make_tuple( - Number<(m0 + 1 + HotloopLocalBufSwitch * mfma_reg_buf) % - 2>{}, - I0, - I0, - k0, - I0, - I0), - a_thread_buf); + static_for<0, KGroup, 1>{}([&](auto kg0) 
{ + a_thread_copy_.Run( + a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(Number<0>{}, + I0, + I0, + Number{}, + I0, + I0), + a_block_buf.At(local_read_buf), + a_thread_desc_, + make_tuple( + Number<(m0 + 2 + HotloopLocalBufSwitch * mfma_reg_buf) % + 2>{}, + I0, + I0, + k0, + I0, + Number{}), + a_thread_buf); + }); + }); + } + else if constexpr(m0.value == (MRepeat - 1)) + { + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, KGroup, 1>{}([&](auto kg0) { + a_thread_copy_.Run( + a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(Number<(m0 + 2) % MRepeat>{}, + I0, + I0, + Number{}, + I0, + I0), + a_block_buf.At(local_read_buf), + a_thread_desc_, + make_tuple( + Number<(m0 + 2 + HotloopLocalBufSwitch * mfma_reg_buf) % + 2>{}, + I0, + I0, + k0, + I0, + Number{}), + a_thread_buf); + }); }); } else { static_for<0, KRepeat, 1>{}([&](auto k0) { - a_thread_copy_.Run( - a_block_desc_m0_m1_m2_k0_k1_k2, - make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0), - a_block_buf.At(mfma_reg_buf), - a_thread_desc_, - make_tuple( - Number<(m0 + 1 + HotloopLocalBufSwitch * mfma_reg_buf) % - 2>{}, - I0, - I0, - k0, - I0, - I0), - a_thread_buf); + static_for<0, KGroup, 1>{}([&](auto kg0) { + a_thread_copy_.Run( + a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(Number<(m0 + 2) % MRepeat>{}, + I0, + I0, + Number{}, + I0, + I0), + a_block_buf.At(mfma_reg_buf), + a_thread_desc_, + make_tuple( + Number<(m0 + 2 + HotloopLocalBufSwitch * mfma_reg_buf) % + 2>{}, + I0, + I0, + k0, + I0, + Number{}), + a_thread_buf); + }); }); } - - HotLoopScheduler(m0); }); + HotLoopScheduler(); }; LoopFunc(I0, I1); @@ -667,20 +544,14 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3{}([&](auto m0) { - if constexpr(m0.value == 0) - { - b_blockwise_copy.Run(b_grid_desc, - b_grid_buf, - b_block_desc_n0_n1_k0_k1, - b_block_origin_idx, - b_thread_bufs(I1)); - } - else if constexpr(m0.value == MRepeat - 1) - { - a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1)); - } + 
b_blockwise_copy.Run(b_grid_desc, + b_grid_buf, + b_block_desc_n0_n1_k0_k1, + b_block_origin_idx, + b_thread_bufs(I1)); + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1)); + static_for<0, MRepeat, 1>{}([&](auto m0) { static_for<0, KRepeat, 1>{}([&](auto k0) { static_for<0, NRepeat, 1>{}([&](auto n0) { vector_type a_thread_vec; @@ -707,36 +578,68 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3{}([&](auto k0) { - a_thread_copy_.Run( - a_block_desc_m0_m1_m2_k0_k1_k2, - make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0), - a_block_buf.At(I1), - a_thread_desc_, - make_tuple(Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, I0), - a_thread_buf); + static_for<0, KGroup, 1>{}([&](auto kg0) { + a_thread_copy_.Run( + a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple( + Number<0>{}, I0, I0, Number{}, I0, I0), + a_block_buf.At(I1), + a_thread_desc_, + make_tuple( + Number<(m0 + 2) % 2>{}, I0, I0, k0, I0, Number{}), + a_thread_buf); + }); + }); + } + else if constexpr(m0.value == (MRepeat - 1)) + { + static_for<0, KRepeat, 1>{}([&](auto k0) { + static_for<0, KGroup, 1>{}([&](auto kg0) { + a_thread_copy_.Run( + a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(Number<(m0 + 2) % MRepeat>{}, + I0, + I0, + Number{}, + I0, + I0), + a_block_buf.At(I1), + a_thread_desc_, + make_tuple( + Number<(m0 + 2) % 2>{}, I0, I0, k0, I0, Number{}), + a_thread_buf); + }); }); } else { static_for<0, KRepeat, 1>{}([&](auto k0) { - a_thread_copy_.Run( - a_block_desc_m0_m1_m2_k0_k1_k2, - make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0), - a_block_buf.At(I0), - a_thread_desc_, - make_tuple(Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, I0), - a_thread_buf); + static_for<0, KGroup, 1>{}([&](auto kg0) { + a_thread_copy_.Run( + a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple(Number<(m0 + 2) % MRepeat>{}, + I0, + I0, + Number{}, + I0, + I0), + a_block_buf.At(I0), + a_thread_desc_, + make_tuple( + Number<(m0 + 2) % 2>{}, I0, I0, k0, I0, Number{}), + a_thread_buf); + }); }); } - - 
EpilogueScheduler_1(m0); }); + HotLoopScheduler(); + static_for<0, MRepeat, 1>{}([&](auto m0) { static_for<0, KRepeat, 1>{}([&](auto k0) { static_for<0, NRepeat, 1>{}([&](auto n0) { @@ -764,25 +667,29 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3{}([&](auto k0) { - a_thread_copy_.Run( - a_block_desc_m0_m1_m2_k0_k1_k2, - make_tuple(Number{}, I0, I0, k0, I0, I0), - a_block_buf.At(I1), - a_thread_desc_, - make_tuple( - Number<(m0 + 1 + HotloopLocalBufSwitch) % 2>{}, I0, I0, k0, I0, I0), - a_thread_buf); + static_for<0, KGroup, 1>{}([&](auto kg0) { + a_thread_copy_.Run( + a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple( + Number{}, I0, I0, Number{}, I0, I0), + a_block_buf.At(I1), + a_thread_desc_, + make_tuple(Number<(m0 + 2 + HotloopLocalBufSwitch) % 2>{}, + I0, + I0, + k0, + I0, + Number{}), + a_thread_buf); + }); }); - - EpilogueScheduler_2(); } }); - // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle - // latency - // __builtin_amdgcn_sched_barrier(0); + + HotLoopScheduler(); } else if constexpr(TailNum == TailNumber::Odd) { @@ -813,18 +720,21 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3{}([&](auto k0) { - a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, - make_tuple(Number{}, I0, I0, k0, I0, I0), - a_block_buf.At(I0), - a_thread_desc_, - make_tuple(Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, I0), - a_thread_buf); + static_for<0, KGroup, 1>{}([&](auto kg0) { + a_thread_copy_.Run( + a_block_desc_m0_m1_m2_k0_k1_k2, + make_tuple( + Number{}, I0, I0, Number{}, I0, I0), + a_block_buf.At(I0), + a_thread_desc_, + make_tuple( + Number<(m0 + 2) % 2>{}, I0, I0, k0, I0, Number{}), + a_thread_buf); + }); }); - - EpilogueScheduler_2(); } }); } @@ -841,7 +751,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3, + Sequence<1, 1, 1, 1, 1, KPack / KGroup>, Sequence<0, 1, 2, 3, 4, 5>, 5, A_K1, diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp 
b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp index ce507ca8d3..6c1c5b1c4d 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp @@ -58,6 +58,11 @@ struct BlockwiseGemmXdlops_pipeline_base static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops; static constexpr index_t KRepeat = KPerThread / KPack; static constexpr index_t KPerInnerLoop = KPack; + static constexpr index_t KGroup = + ((MPerXDL == 16 && MPerXDL == 16 && xdlops_gemm.KPerXdlops == 128) || + (MPerXDL == 32 && MPerXDL == 32 && xdlops_gemm.KPerXdlops == 64)) + ? 2 + : 1; static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL); static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp index 238ab14606..c0d9464136 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp @@ -167,11 +167,13 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle using mfma_selector = MfmaSelector; static constexpr index_t KPack = math::max(math::lcm(AK1Number, BK1Number), mfma_selector::selected_mfma.k_per_blk); + static constexpr index_t KGroup = mfma_selector::selected_mfma.k_per_blk == 32 ? 
2 : 1; static constexpr index_t KLane = mfma_selector::GetKPerXdlops() / mfma_selector::GetK1PerXdlops(); - static constexpr index_t KRepeat = KPerBlock / KLane / KPack; - static constexpr index_t NLane = NPerXdl; - static constexpr index_t NWave = NPerBlock / NPerXdl / NXdlPerWave; + static constexpr index_t KPackPerGroup = KPack / KGroup; + static constexpr index_t KRepeat = KPerBlock / KLane / KPackPerGroup; + static constexpr index_t NLane = NPerXdl; + static constexpr index_t NWave = NPerBlock / NPerXdl / NXdlPerWave; static constexpr auto MakeDsGridPointer() { @@ -209,7 +211,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle } __host__ __device__ static auto CalculateBK0Shuffled(index_t K) { - return math::integer_divide_ceil(K, KLane * KPack); + return math::integer_divide_ceil(K, KLane * KPackPerGroup); } __host__ __device__ static auto CalculateKPadded(index_t K) @@ -351,7 +353,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle __host__ __device__ static auto MakeBGridDescriptor_Preshuffled(index_t N0, index_t K0) { - constexpr index_t NkSwizzleNumber = Number{}; + constexpr index_t NkSwizzleNumber = Number{}; return make_naive_tensor_descriptor( make_tuple(N0 / NWave, NWave, K0, NkSwizzleNumber), make_tuple(NWave * K0 * NkSwizzleNumber, K0 * NkSwizzleNumber, NkSwizzleNumber, I1)); @@ -1228,7 +1230,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle make_multi_index(n_block_data_idx_on_grid, get_warp_local_1d_id() % NWave, 0, - KPack * (get_thread_local_1d_id() % warpSize))); + KPackPerGroup * (get_thread_local_1d_id() % warpSize))); // LDS allocation for A and B: be careful of alignment // Cast after lds @@ -1668,7 +1670,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle make_multi_index(n_block_data_idx_on_grid, get_warp_local_1d_id() % NWave, 0, - KPack * (get_thread_local_1d_id() % warpSize))); + KPackPerGroup * (get_thread_local_1d_id() % warpSize))); // LDS allocation for A and B: be careful of alignment // Cast 
after lds diff --git a/include/ck/utility/blkgemmpipe_scheduler.hpp b/include/ck/utility/blkgemmpipe_scheduler.hpp index 39407cb8f6..6c788fb41e 100644 --- a/include/ck/utility/blkgemmpipe_scheduler.hpp +++ b/include/ck/utility/blkgemmpipe_scheduler.hpp @@ -48,6 +48,15 @@ enum struct TailNumber // prefetchstages Full, }; + +enum SchedulerGroup : uint32_t +{ + SCHED_GROUP_MFMA = 0x008, // Matrix FMA instructions + SCHED_GROUP_VMEM = 0x020, // Global memory operations + SCHED_GROUP_LDS_READ = 0x100, // LDS read operations + SCHED_GROUP_LDS_WRITE = 0x200 // LDS write operations +}; + template , - Row, - F8, - F8, - Tuple, - F16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances( - std::vector, - Row, - F8, - F8, - Tuple, - F16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances( - std::vector, - Row, - F8, - F8, - Tuple, - F16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instances( - std::vector, - Row, - F8, - F8, - Tuple, - F16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instances( - std::vector, - Row, - F8, - F8, - Tuple, - F16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances_v2( - std::vector, - Row, - F8, - F8, - Tuple, - F16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances_v2( - std::vector, - Row, - F8, - F8, - Tuple, - F16, - PassThrough, - 
PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances_v2( - std::vector, - Row, - F8, - F8, - Tuple, - F16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instances_v2( - std::vector, - Row, - F8, - F8, - Tuple, - F16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instances_v2( - std::vector, - Row, - F8, - F8, - Tuple, - F16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p1( - std::vector, - Row, - F8, - F8, - Tuple, - F16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p2( - std::vector, - Row, - F8, - F8, - Tuple, - F16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p1( std::vector, - Row, - F8, - F8, - Tuple, - BF16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances( - std::vector, - Row, - F8, - F8, - Tuple, - BF16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances( - std::vector, - Row, - F8, - F8, - Tuple, - BF16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instances( - 
std::vector, - Row, - F8, - F8, - Tuple, - BF16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instances( - std::vector, - Row, - F8, - F8, - Tuple, - BF16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instances_v2( - std::vector, - Row, - F8, - F8, - Tuple, - BF16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances_v2( - std::vector, - Row, - F8, - F8, - Tuple, - BF16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances_v2( - std::vector, - Row, - F8, - F8, - Tuple, - BF16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instances_v2( - std::vector, - Row, - F8, - F8, - Tuple, - BF16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instances_v2( - std::vector, - Row, - F8, - F8, - Tuple, - BF16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances_p1( - std::vector, - Row, - F8, - F8, - Tuple, - BF16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - -void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances_p2( - std::vector, - Row, - F8, - F8, - Tuple, - BF16, - PassThrough, - PassThrough, - MultiplyMultiply>>>& - instances); - void 
add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p1( std::vector && is_same_v && is_same_v) { - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instances( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instances( - op_ptrs); - - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances_v2( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances_v2( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances_v2( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instances_v2( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instances_v2( - op_ptrs); - - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p1( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p2( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p1( op_ptrs); add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p2( @@ -612,33 +250,6 @@ struct DeviceOperationInstanceFactory< if constexpr(is_same_v && is_same_v && is_same_v) { - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instances( - 
op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instances( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instances( - op_ptrs); - - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instances_v2( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances_v2( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances_v2( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instances_v2( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instances_v2( - op_ptrs); - - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances_p1( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances_p2( - op_ptrs); - add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p1( op_ptrs); add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances_p2( diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/CMakeLists.txt index 37233ac5b4..743a0272f7 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/CMakeLists.txt @@ -2,18 +2,6 @@ 
set(GEMM_MULTIPLY_MULTIPLY_WEIGHT_PRESHUFFLE_INSTANCES) list(APPEND GEMM_MULTIPLY_MULTIPLY_WEIGHT_PRESHUFFLE_INSTANCES - f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp - f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp - f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp - f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instance.cpp - f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instance.cpp - f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance_v2.cpp - f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance_v2.cpp - f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance_v2.cpp - f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instance_v2.cpp - f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instance_v2.cpp - f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance_p1.cpp - f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance_p2.cpp f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p1.cpp f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p2.cpp f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p3.cpp @@ -21,18 +9,6 @@ list(APPEND GEMM_MULTIPLY_MULTIPLY_WEIGHT_PRESHUFFLE_INSTANCES f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p5.cpp f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p6.cpp - 
f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp - f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp - f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp - f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instance.cpp - f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instance.cpp - f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance_v2.cpp - f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance_v2.cpp - f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance_v2.cpp - f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instance_v2.cpp - f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instance_v2.cpp - f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance_p1.cpp - f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance_p2.cpp f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p1.cpp f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p2.cpp f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p3.cpp @@ -41,18 +17,6 @@ list(APPEND GEMM_MULTIPLY_MULTIPLY_WEIGHT_PRESHUFFLE_INSTANCES f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p6.cpp ) -set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp PROPERTIES 
COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p4_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_p5_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance_p1.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance_p2.cpp PROPERTIES 
COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p1.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p3.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") @@ -60,18 +24,6 @@ set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p5.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") set_source_files_properties(f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance_p6.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") 
-set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p4_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_p5_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance_p1.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") -set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance_p2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p1.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") 
set_source_files_properties(f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p3.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1") diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp index e5ada03a46..4613a0f24d 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp @@ -171,13 +171,13 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x1 //############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // Compute friendly // 256x[64, 256, 32]x128 - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 256, 128, 16, 16, 16, 16, 8, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 256, 128, 16, 16, 16, 16, 16, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, 
Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 224, 128, 16, 16, 16, 16, 8, 7, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 64, 1, 4>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 192, 128, 16, 16, 16, 16, 8, 6, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 192, 128, 16, 16, 16, 16, 16, 3, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 160, 128, 16, 16, 16, 16, 8, 5, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 64, 1, 4>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 128, 128, 16, 16, 16, 16, 8, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + 
DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 128, 128, 16, 16, 16, 16, 16, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 96, 128, 16, 16, 16, 16, 8, 3, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 64, 1, 4>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 64, 128, 16, 16, 16, 16, 8, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 64, 128, 16, 16, 16, 16, 16, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> // clang-format on >; @@ -190,13 +190,13 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x1 //############################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| 
| PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // 224x[64, 256, 32]x128 - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 224, 256, 128, 16, 16, 16, 16, 7, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 224, 256, 128, 16, 16, 16, 16, 14, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 224, 224, 128, 16, 16, 16, 16, 7, 7, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 224, 192, 128, 16, 16, 16, 16, 7, 6, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, 
PassThrough, MultiplyMultiply, GemmSpec, 256, 224, 192, 128, 16, 16, 16, 16, 14, 3, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 224, 160, 128, 16, 16, 16, 16, 7, 5, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 224, 128, 128, 16, 16, 16, 16, 7, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 224, 128, 128, 16, 16, 16, 16, 14, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 224, 96, 128, 16, 16, 16, 16, 7, 3, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, 
Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 224, 64, 128, 16, 16, 16, 16, 7, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 224, 64, 128, 16, 16, 16, 16, 14, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> // clang-format on >; template @@ -208,13 +208,13 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x1 //############################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // 192x[64, 256, 32]x128, 192x[64]x256 - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 192, 256, 128, 16, 16, 16, 16, 6, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 192, 256, 128, 16, 16, 16, 16, 12, 4, 
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 192, 224, 128, 16, 16, 16, 16, 6, 7, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 64, 1, 4>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 192, 192, 128, 16, 16, 16, 16, 6, 6, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 192, 192, 128, 16, 16, 16, 16, 12, 3, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 192, 160, 128, 16, 16, 16, 16, 6, 5, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 64, 1, 4>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 192, 
128, 128, 16, 16, 16, 16, 6, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 192, 128, 128, 16, 16, 16, 16, 12, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 192, 96, 128, 16, 16, 16, 16, 6, 3, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 64, 1, 4>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 192, 64, 128, 16, 16, 16, 16, 6, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 192, 64, 128, 16, 16, 16, 16, 12, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> // clang-format on >; template @@ -226,13 +226,13 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x1 
//############################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // 160x[64, 256, 32]x128, 160x[64, 96, 32]x256 - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 160, 256, 128, 16, 16, 16, 16, 5, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 160, 256, 128, 16, 16, 16, 16, 10, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 160, 224, 128, 16, 16, 16, 16, 5, 7, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 160, 192, 128, 16, 16, 16, 16, 5, 6, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 160, 192, 128, 16, 16, 16, 16, 10, 3, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 160, 160, 128, 16, 16, 16, 16, 5, 5, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 160, 128, 128, 16, 16, 16, 16, 5, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 160, 128, 128, 16, 16, 16, 16, 10, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 160, 96, 128, 16, 16, 16, 16, 5, 3, S<8, 32, 1>, S<1, 
0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<4, 4, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 160, 64, 128, 16, 16, 16, 16, 5, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 160, 64, 128, 16, 16, 16, 16, 10, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> // clang-format on >; template @@ -244,10 +244,10 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x1 //############################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 96, 128, 16, 16, 16, 16, 4, 3, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 64, 1, 4>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, 
BlockGemmPipelineVersion::v3, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 64, 128, 16, 16, 16, 16, 4, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 128, 256, 16, 16, 16, 16, 4, 4, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 64, 128, 16, 16, 16, 16, 8, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 128, 256, 16, 16, 16, 16, 8, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 96, 256, 16, 16, 16, 16, 4, 3, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 64, 1, 4>, S<8, 8, 1>, 
BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 64, 256, 16, 16, 16, 16, 4, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 64, 256, 16, 16, 16, 16, 8, 1, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> // clang-format on >; @@ -259,11 +259,11 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x1 //############################################| | | | | Type| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| //############################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| //############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - 
DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 256, 128, 16, 16, 16, 16, 4, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 256, 128, 16, 16, 16, 16, 8, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 224, 128, 16, 16, 16, 16, 4, 7, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 64, 1, 4>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 192, 128, 16, 16, 16, 16, 4, 6, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 192, 128, 16, 16, 16, 16, 8, 3, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, 
BlockGemmPipelineVersion::v3, F8>, DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 160, 128, 16, 16, 16, 16, 4, 5, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 64, 1, 4>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, - DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 128, 128, 16, 16, 16, 16, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> + DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 128, 128, 16, 16, 16, 16, 8, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> // clang-format on >; From f05e45ba59b76cb6ea83c471860ded65d5fc623f Mon Sep 17 00:00:00 2001 From: Khushbu Agarwal Date: Mon, 12 May 2025 09:56:23 -0700 Subject: [PATCH 115/443] Disable SMFMA gfx90a (#2184) * sparsity fix for gfx90a * reverting tile_engine changes --- include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 9 --------- .../ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp | 4 ++-- tile_engine/ops/gemm/gemm_instance_builder.py | 6 +----- 3 files changed, 3 insertions(+), 16 deletions(-) diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp index 5ed97dc05c..f050a8e382 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp @@ -109,20 
+109,11 @@ using WarpGemmMfmaF16F16F32M64N4K16 = WarpGemmImpl>; // fp16 2:4 structured sparsity -#if defined(__gfx94__) || defined(__gfx95__) using WarpGemmSmfmacF16F16F32M32N32K16 = WarpGemmSmfmacImpl>>; using WarpGemmSmfmacF16F16F32M16N16K32 = WarpGemmSmfmacImpl>>; -#else // gfx 90a does not support smfmac -using WarpGemmSmfmacF16F16F32M32N32K16 = WarpGemmImpl, - 2>>; -using WarpGemmSmfmacF16F16F32M16N16K32 = WarpGemmImpl, - 2>>; -#endif // bf16 using WarpGemmMfmaBf16Bf16F32M32N32K8 = WarpGemmImpl< diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp index 97fd2a8742..cd6cd3a399 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp @@ -49,7 +49,7 @@ struct WarpGemmAttributeSmfmacImplF16F16F32M32N32K16 const int32_t& idx, bool_constant = {}) const { -#if defined(__gfx9__) +#if defined(__gfx94_) or defined(__gfx95_) c_vec = __builtin_amdgcn_smfmac_f32_32x32x16_f16(a_vec, b_vec, c_vec, idx, 0, 0); #else ck_tile::ignore = c_vec; @@ -100,7 +100,7 @@ struct WarpGemmAttributeSmfmacImplF16F16F32M16N16K32 const int32_t& idx, bool_constant = {}) const { -#if defined(__gfx9__) +#if defined(__gfx94_) or defined(__gfx95_) c_vec = __builtin_amdgcn_smfmac_f32_16x16x32_f16(a_vec, b_vec, c_vec, idx, 0, 0); #else ck_tile::ignore = c_vec; diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index c00554df8f..3839523e3d 100755 --- a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -535,11 +535,7 @@ struct GemmDispatcher { ((tile[6] == 32 and tile[7] == 32 and tile[8] == 16) or (tile[6] == 16 and tile[7] == 16 and tile[8] == 32)) content += f""" -#if defined(__gfx908__) - run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, 
{tile[8]}, {BOOL_MAP(False)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream); -#else - run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(sparse)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream); -#endif""" + run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(sparse)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream);""" content += f""" }} else {{""" for tile in tile_params: From 29206047868b5a3eda88aa33ff5b997ba4e008b4 Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Tue, 13 May 2025 12:19:25 +0800 Subject: [PATCH 116/443] [CK_TILE] Add logits soft-capping & customization support to the FMHA forward kernel/pipelines (#2163) * hack for cap logits * fix bug * Re-format files * Allow specifying logits_soft_cap through APIs * Support turn on/off logits_soft_cap in async pipeline * Do not generate non-verified kernels * Align receipt used in Aiter * Sync logits soft-capping across pipelines * Re-enable some hdim pipelines * fix perf * Add attention variant for logits_soft_cap * Add newline at end-of-file * Fix performance * Add comment to explain logits_soft_cap pre-processing * Unify code * Unify floating-point literal style * Use class data member to slience the compilation error * [CK_TILE] Update attention customizaton interface: add LogitsMask() (#2133) * Send 'mask' along with variant params to the LogitsMask() * Send block indices to the variant * Add indices parameters in variant interface * Fix fmha bwd codegen error * Allow switch logits_soft_cap impl * Eliminate register spills * Fix compilation errors * Fix wrong LSE * Fix LSE for splitkv kernel * Sync splitkv pipeline changes * Add batch_prefill kernel/pipeline * Fix codegen error * Undo changes in CMakeLists.txt * Merge pipeline filtering check * Use 
different code path if kHasLogitsSoftCap=false * Remove [[maybe_unused]] attribute * Use pre-existing compile-time flag to instantiate templates * Sync pipeline changes * Update CHANGELOG.md --------- Co-authored-by: Bernard Co-authored-by: coderfeli --- CHANGELOG.md | 1 + .../ck_tile/01_fmha/codegen/cpp_symbol_map.py | 2 + .../01_fmha/codegen/ops/fmha_batch_prefill.py | 595 +++++++++ .../ck_tile/01_fmha/codegen/ops/fmha_bwd.py | 1 + .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py | 72 +- .../01_fmha/codegen/ops/fmha_fwd_splitkv.py | 57 +- example/ck_tile/01_fmha/fmha_fwd.cpp | 17 + example/ck_tile/01_fmha/fmha_fwd.hpp | 212 +++ example/ck_tile/01_fmha/generate.py | 3 +- include/ck_tile/core.hpp | 1 + include/ck_tile/core/numeric/math.hpp | 41 + include/ck_tile/core/tensor/load_tile.hpp | 90 +- include/ck_tile/core/tensor/tensor_view.hpp | 21 + .../core/tensor/tile_scatter_gather.hpp | 731 +++++++++++ .../ck_tile/core/tensor/tile_window_utils.hpp | 7 + include/ck_tile/ops/fmha.hpp | 4 + include/ck_tile/ops/fmha/block/variants.hpp | 274 ++++ .../fmha/kernel/fmha_batch_prefill_kernel.hpp | 1134 +++++++++++++++++ .../ops/fmha/kernel/fmha_fwd_kernel.hpp | 83 +- .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp | 77 +- ..._batch_prefill_pipeline_qr_ks_vs_async.hpp | 900 +++++++++++++ ...pipeline_qr_ks_vs_async_default_policy.hpp | 18 + ...litkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp | 98 +- ...ock_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp | 93 +- .../pipeline/block_fmha_pipeline_problem.hpp | 6 + .../pipeline/block_fmha_pipeline_qr_ks_vs.hpp | 102 +- .../block_fmha_pipeline_qr_ks_vs_async.hpp | 101 +- .../pipeline/block_fmha_pipeline_qs_ks_vs.hpp | 102 +- .../ops/fmha/pipeline/tile_fmha_traits.hpp | 4 + 29 files changed, 4621 insertions(+), 226 deletions(-) create mode 100644 example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py create mode 100644 include/ck_tile/core/tensor/tile_scatter_gather.hpp create mode 100644 include/ck_tile/ops/fmha/block/variants.hpp create mode 100644 
include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp create mode 100644 include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp create mode 100644 include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async_default_policy.hpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 4be173dd85..a1163f059c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj * Added GEMM pipeline for microscaling (MX) data types * Added support for FP16 2:4 structured sparsity to universal GEMM. * Added support for Split K for grouped convolution backward data. +* Added logit soft-capping support for fMHA forward kernels. ### Optimized diff --git a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py index 332707eafd..5b9d5742b4 100644 --- a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py +++ b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py @@ -114,12 +114,14 @@ LAYOUT_MAP = { PIPELINE_MAP = { "qr" : "ck_tile::BlockFmhaPipelineQRKSVS", "qr_async" : "ck_tile::BlockFmhaPipelineQRKSVSAsync", + "qs" : "ck_tile::BlockFmhaPipelineQSKSVS", } PIPELINE_ENUM_MAP = { "qr" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS", "qr_async" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC", "qr_nwarp_sshuffle" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS", + "qs" : "ck_tile::BlockFmhaPipelineEnum::QSKSVS", } BOOL_MAP = { diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py b/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py new file mode 100644 index 0000000000..30b9299963 --- /dev/null +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py @@ -0,0 +1,595 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
+# generate kernel instances to speed up compilation + +import copy +from dataclasses import dataclass +import fnmatch +import itertools +from pathlib import Path +from typing import List, Optional, Tuple + +from codegen.cmake_config import * +from codegen.cpp_symbol_map import * + + +DTYPE_BITS = { + "fp32": 32, + "fp16": 16, + "bf16": 16, + "fp8" : 8, + "bf8" : 8 +} + +K0_MAX_SUBMAX_MAP = { + 32 : 32, + 64 : 64, + 96 : 128, + 128: 128, + 256: 256 +} + +FMHA_BATCH_PREFILL_PIPELINE_MAP = { + "qr_async" : "ck_tile::BlockFmhaBatchPrefillPipelineQRKSVSAsync", +} + +FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n +// auto generated by generate.py +#include "ck_tile/ops/fmha/block/variants.hpp" +#include "fmha_fwd.hpp" +""" + +FMHA_FWD_KERNEL_BODY=""" +using fmha_dtype_{F_idx} = {F_dtype}; + +using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>; + +using fmha_shape_{F_idx} = ck_tile::TileFmhaShape, + ck_tile::sequence<{F_wm0}, {F_wn0}, {F_wk0}>, + ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>, + ck_tile::sequence<{F_wm1}, {F_wn1}, {F_wk1}>, + {F_vlayout}>; + +using fmha_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad}, + {F_skpad}, + {F_dpad}, + {F_dvpad}, + {F_logits}, + {F_bias}, + false, + {F_lse}, + {F_dropout}, + {F_squant}, + {F_occupancy}>; + +using fmha_variant_{F_idx} = ck_tile::ComposedAttention<{F_logits} * ck_tile::LOGITS_SOFT_CAP, CK_TILE_FMHA_FWD_FAST_EXP2>; + +using fmha_mask_{F_idx} = {F_mask}; + +using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaPipelineProblem< + typename FmhaFwdTypeConfig::QDataType, + typename FmhaFwdTypeConfig::KDataType, + typename FmhaFwdTypeConfig::VDataType, + typename FmhaFwdTypeConfig::SaccDataType, + typename FmhaFwdTypeConfig::SMPLComputeDataType, + typename FmhaFwdTypeConfig::BiasDataType, + typename FmhaFwdTypeConfig::RandValOutputDataType, + typename 
FmhaFwdTypeConfig::LSEDataType, + typename FmhaFwdTypeConfig::PDataType, + typename FmhaFwdTypeConfig::OaccDataType, + typename FmhaFwdTypeConfig::ODataType, + fmha_shape_{F_idx}, + {F_mode}, + fmha_variant_{F_idx}, + fmha_mask_{F_idx}, + fmha_trait_{F_idx}>; + +using fmha_pipeline_{F_idx} = {F_pipeline}< + fmha_pipeline_problem_{F_idx}>; + +using fmha_epilogue_{F_idx} = + ck_tile::Default2DEpilogue::OaccDataType, + typename FmhaFwdTypeConfig<{F_dtype}>::ODataType, + {F_spad}, {F_dvpad}>>; + +using fmha_kernel_{F_idx} = + ck_tile::FmhaBatchPrefillWithPagedKVCacheKernel; + +using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, + {F_pipeline_enum}, {F_logits}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; + +#include + +template<> +float fmha_batch_prefill_(const ck_tile::stream_config& s, fmha_batch_prefill_args a) +{{ + using k_ = fmha_kernel_{F_idx}; + if(s.log_level_ > 0) + std::cout << ", " << k_::GetName() << std::flush; + auto [kargs, grids] = fmha_batch_prefill_create_kargs_and_grids(a); + constexpr dim3 blocks = k_::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; + return ck_tile::launch_kernel(s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); +}} +""" + +FMHA_FWD_API_FILENAME="fmha_batch_prefill_api.cpp" +FMHA_FWD_API=""" +float fmha_batch_prefill(fmha_batch_prefill_traits t, fmha_batch_prefill_args a, const ck_tile::stream_config& s){{ + float r = -1; +{F_dispatch} + return r; +}} +""" + +FMHA_FWD_API_PER_DTYPE=""" {F_if}(t.data_type.compare(\"{F_dtype}\") == 0){{ +{F_hdim_case} + }} +""" +FMHA_FWD_API_PER_HDIM_CASE=""" {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <= {F_hdim_v}) {{ +{F_inner_dispatch} + }} +""" + +FMHA_FWD_API_INNER_DISPATCH=""" {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && (t.has_logits_soft_cap == {F_logits}) && ({F_mask_check}) && 
(t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse}) && (t.has_dropout == {F_dropout}) && (t.do_fp8_static_quant == {F_squant}) && + ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{ + using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; + return fmha_batch_prefill_(s, a); + }} +""" + +@dataclass +class FmhaFwdApiTrait: + pipeline_tag : str + # sync with fmha_fwd_traits<>, to generate fallback calls + hdim : str + dtype : str # data type + mode : str # value from MODE_MAP + bm0 : int # tile size along q seqlen (block size) + bn0 : int # tile size along qk seqlen + bk0 : int # tile size along qk gemm unroll + bn1 : int # tile size along v head_dim + bk1 : int # tile size along kv gemm unroll + bk0max : int + vlayout : str + logits : str + mask : str + bias : str # + lse : str # + dropout : str + squant : str # + spad : str + skpad : str + dpad : str + dvpad : str + + @property + def name(self) -> str: + return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-'+\ + f'{self.vlayout}-{self.logits}-{self.mask}-{self.bias}-{self.lse}-{self.dropout}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}' + + @property + def scheck(self) -> str: + if self.mode == 'group': return 'true/*group mode spad always true*/' # group mode only generate spad/skpad == true + if self.pipeline_tag == 'qr_async': + if self.spad == 't' : return 'true' # always support + else : return 'true' + elif self.pipeline_tag in ['qr']: + if self.spad == 't' : return f'true /*a.seqlen_q % {self.bm0} != 0*/' # TODO: order of get_pipelines() matters! 
(ugly) + else : return f'a.seqlen_q % {self.bm0} == 0' + else: assert False + + @property + def skcheck(self) -> str: + if self.mode == 'group': return 'true/*group mode skpad always true*/' # group mode only generate spad/skpad == true + if self.pipeline_tag == 'qr_async': + if self.skpad == 't' : return f'a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0' + else : return f'a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0' + elif self.pipeline_tag in ['qr', 'qr_fp8']: + if self.skpad == 't' : return f'true /*a.seqlen_k % {self.bn0} != 0*/' # TODO: order of get_pipelines() matters! (ugly) + else : return f'a.seqlen_k % {self.bn0} == 0' + else: assert False + + @property + def dcheck(self) -> str: + if self.pipeline_tag == 'qr_async': + vec = int((32 * 4) / DTYPE_BITS[self.dtype]) + if self.dpad == 't': return f'a.hdim_q % {vec} == 0' + else : assert False + elif self.pipeline_tag in ['qr']: + bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max] + if self.dpad == 't': return f'true /*a.hdim_q % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly) + else : return f'a.hdim_q % {bk0submax} == 0' + else: assert False + + @property + def dvcheck(self) -> str: + if self.pipeline_tag == 'qr_async': + vec = int((32 * 4) / DTYPE_BITS[self.dtype]) + if self.dvpad == 't': return f'a.hdim_v % {vec} == 0' + else : assert False + elif self.pipeline_tag in ['qr']: + bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max] + if self.dvpad == 't': return f'true /*a.hdim_v % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! 
(ugly) + else : return f'a.hdim_v % {bk0submax} == 0' + else: assert False + +@dataclass +class FmhaFwdPipeline: + tag : str + + F_vlayout : str # row/col + F_spad : str # true/false + F_skpad : str # + F_dpad : str # + F_dvpad : str # + F_logits : str # t/f + F_bias : str # true/false + F_lse : str # + F_dropout : str # + F_squant : str # + F_mask : str # value from MASK_MAP + + @property + def name(self) -> str: + def pad_name() -> str: + n = '' + if self.F_spad == 't': n += 's' + if self.F_skpad == 't' : n += 'sk' + if self.F_dpad == 't' : n += 'd' + if self.F_dvpad == 't' : n += 'dv' + if n != '' : n = 'p' + n + return n + pn = pad_name() + n = f'{self.tag}_v{self.F_vlayout[0]}' + if pn != '' : n += f'_{pn}' + else: n += '_npad' + + if self.F_logits == 't' : n += '_logits' + else: n += '_nlogits' + + if self.F_bias != 'no' : n += f'_{self.F_bias}' + else: n += '_nbias' + + if self.F_mask[0:2] == 's_': + if self.F_mask == 's_mask': n += f'_mask' + else: n += '_nmask' + else: + if self.F_mask != 'no' : n += f'_m{self.F_mask[0]}' + else: n += '_nmask' + + if self.F_lse == 't' : n += '_lse' + else: n += '_nlse' + + if self.F_dropout == 't' : n += '_dropout' + else: n += '_ndropout' + + if self.F_squant == 't' : n += '_squant' + else: n += '_nsquant' + return n + +class FmhaFwdApiPool: + def __init__(self, mask_impl): + self.pool = dict() + self.mask_impl = mask_impl + + def register_traits(self, trait : FmhaFwdApiTrait) -> None: + # TODO: do we need to check duplication? 
+ if trait.dtype not in self.pool.keys(): + self.pool[trait.dtype] = dict() + if trait.hdim not in self.pool[trait.dtype].keys(): + self.pool[trait.dtype][trait.hdim] = list() + + self.pool[trait.dtype][trait.hdim].append(copy.copy(trait)) + + @property + def api(self) -> str: + per_dtypes=str() + for i, dtype in enumerate(self.pool.keys()): + per_hdim_case=str() + for j, hdim in enumerate(self.pool[dtype].keys()): + traits=self.pool[dtype][hdim] + inners=str() + for k, trait in enumerate(traits): + if_k = 'if' if k == 0 else 'else if' + inners = inners + FMHA_FWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_vlayout=LAYOUT_MAP[trait.vlayout], + F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_logits=BOOL_MAP[trait.logits], F_mask=get_mask_map(self.mask_impl)[trait.mask], + F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias], + F_lse=BOOL_MAP[trait.lse], F_dropout=BOOL_MAP[trait.dropout] , + F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, + F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], + F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max, + F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype]) + if_j = 'if' if j == 0 else 'else if' + per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_hdim_v=trait.bn1, F_inner_dispatch=inners) + if_i = 'if' if i == 0 else 'else if' + per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case) + if not per_dtypes: + # empty string we add some ignore to suppress warning in api + per_dtypes += ' (void)t ; (void)s ; (void)a;' + return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch = per_dtypes) + +@dataclass +class FmhaFwdTileSize: + 
F_bm0 : int # tile size along q seqlen (block size) + F_bn0 : int # tile size along k seqlen + F_bk0 : int # tile size along qk gemm unroll + F_bn1 : int # tile size along v head_dim + F_bk1 : int # tile size along kv gemm unroll + F_bk0max : int # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile) + F_rm0 : int # number of warps for gemm0 along q seqlen + F_rn0 : int # number of warps for gemm0 along k seqlen + F_rk0 : int # number of warps for gemm0 along head dim q (not used) + F_rm1 : int # number of warps for gemm1 along q seqlen + F_rn1 : int # number of warps for gemm1 along head dim v + F_rk1 : int # number of warps for gemm1 along k seqlen (not used) + F_wm0 : int # gemm0 warp size along m + F_wn0 : int # gemm0 warp size along n + F_wk0 : int # gemm0 warp size along k + F_wm1 : int # gemm1 warp size along m + F_wn1 : int # gemm1 warp size along n + F_wk1 : int # gemm1 warp size along k + F_occupancy : int # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy + @property + def name(self) -> str: + return f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0max}" +\ + f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}" +\ + f"_w{self.F_wm0}x{self.F_wn0}x{self.F_wk0}_w{self.F_wm1}x{self.F_wn1}x{self.F_wk1}" +\ + ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}") + +@dataclass +class FmhaFwdKernel: + F_idx : int # this is not a tunable, but a counter to differentiate symbol + F_hdim : int # hdim + F_dtype : str # data type + F_mode : str # value from MODE_MAP + F_tile : FmhaFwdTileSize + F_pipeline : FmhaFwdPipeline + mask_impl : str + + @property + def template(self) -> str: + kernel_body = str() + return FMHA_FWD_KERNEL_HEADER + \ + FMHA_FWD_KERNEL_BODY.format( + F_idx = self.F_idx, + F_hdim = self.F_hdim, + F_dtype = FWD_DTYPE_MAP[self.F_dtype], + F_bm0 = self.F_tile.F_bm0, + F_bn0 = self.F_tile.F_bn0, + 
F_bk0 = self.F_tile.F_bk0, + F_bn1 = self.F_tile.F_bn1, + F_bk1 = self.F_tile.F_bk1, + F_bk0max = self.F_tile.F_bk0max, + F_rm0 = self.F_tile.F_rm0, + F_rn0 = self.F_tile.F_rn0, + F_rk0 = self.F_tile.F_rk0, + F_rm1 = self.F_tile.F_rm1, + F_rn1 = self.F_tile.F_rn1, + F_rk1 = self.F_tile.F_rk1, + F_wm0 = self.F_tile.F_wm0, + F_wn0 = self.F_tile.F_wn0, + F_wk0 = self.F_tile.F_wk0, + F_wm1 = self.F_tile.F_wm1, + F_wn1 = self.F_tile.F_wn1, + F_wk1 = self.F_tile.F_wk1, + F_vlayout = LAYOUT_MAP[self.F_pipeline.F_vlayout], + F_spad = BOOL_MAP[self.F_pipeline.F_spad], + F_skpad = BOOL_MAP[self.F_pipeline.F_skpad], + F_dpad = BOOL_MAP[self.F_pipeline.F_dpad], + F_dvpad = BOOL_MAP[self.F_pipeline.F_dvpad], + F_logits = BOOL_MAP[self.F_pipeline.F_logits], + F_bias = BIAS_MAP[self.F_pipeline.F_bias], + F_lse = BOOL_MAP[self.F_pipeline.F_lse], + F_dropout = BOOL_MAP[self.F_pipeline.F_dropout], + F_squant = BOOL_MAP[self.F_pipeline.F_squant], + F_occupancy = self.F_tile.F_occupancy, + F_pipeline_enum = PIPELINE_ENUM_MAP[self.F_pipeline.tag], + F_mask = get_mask_map(self.mask_impl)[self.F_pipeline.F_mask], + F_mode = MODE_MAP[self.F_mode], + F_pipeline = FMHA_BATCH_PREFILL_PIPELINE_MAP[self.F_pipeline.tag]) + + @property + def name(self) -> str: + # TODO: we don't encode idx here + return f"fmha_batch_prefill_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + \ + self.F_tile.name + '_' + self.F_pipeline.name + + @property + def filename(self) -> str: + return self.name + ".cpp" + + def api_trait(self) -> FmhaFwdApiTrait: + return FmhaFwdApiTrait( + pipeline_tag=self.F_pipeline.tag, + hdim=str(self.F_hdim), + dtype=self.F_dtype, + mode=self.F_mode, + bm0=self.F_tile.F_bm0, + bn0=self.F_tile.F_bn0, + bk0=self.F_tile.F_bk0, + bn1=self.F_tile.F_bn1, + bk1=self.F_tile.F_bk1, + bk0max=self.F_tile.F_bk0max, + vlayout=self.F_pipeline.F_vlayout, + mask=self.F_pipeline.F_mask, + logits=self.F_pipeline.F_logits, + bias=self.F_pipeline.F_bias, + lse=self.F_pipeline.F_lse, + 
dropout=self.F_pipeline.F_dropout, + squant=self.F_pipeline.F_squant, + spad=self.F_pipeline.F_spad, + skpad=self.F_pipeline.F_skpad, + dpad=self.F_pipeline.F_dpad, + dvpad=self.F_pipeline.F_dvpad) + +# TODO: design a more practical way to do it +# this is current supported tile size per hdim +def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]: + if dtype == 'fp16' or dtype == 'bf16': + return { + ### '32' : FmhaFwdTileSize(128, 64, 16, 32, 32, 32, 2, 1, 1, 2, 1, 1, 32, 32, 16, 32, 32, 16, -1), + ### '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1), + ### '96' : FmhaFwdTileSize(128, 128, 32, 128, 32, 96, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1), + '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1), + ### '192' : FmhaFwdTileSize(128, 128, 32, 128, 32, 192, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1), + ### '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 16, 32, 32, 16, -1), + } + elif dtype == 'fp8' or dtype == 'bf8': + return { + ### '64' : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 2, 1, 1, 2, 1, 1, 32, 32, 32, 32, 32, 32, -1), + ### '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 32, 32, 32, 32, -1), + ### '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 32, 32, 32, 32, -1), + } + else: + return None + +def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl) -> Tuple[FmhaFwdApiPool, List[FmhaFwdKernel]]: + # TODO: we don't support tuning yet, so pick up one value for vlayout/pipeline/pad + # support this in future + def get_pipelines(dtype, hdim) -> List[FmhaFwdPipeline]: + # this function will populate a list possible pipelines + # TODO: the order of List matters! the later in this list will be also be checked later + # TODO: currently for qr pipeline, let 't' padding to appear later!! + # TODO: how to design this more generic? 
+ squant = 't' if dtype == 'fp8' else 'f' + pipelines = [] + if dtype in ['fp16', 'bf16']: + for logits, mask, bias, lse, dropout in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"]): + if hdim == 256: + # if True: + pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask)) + # the below two is used for hdim vectorize load + pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 'f', 'f', logits, bias, lse, dropout, squant, mask)) + + pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) + else: + if bias == "bias": + # TODO: rocm 6.2 compiler problem if using qr_async for bias case + pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) + else: + pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, 
mask)) + if receipt == 1 and bias != "bias": + pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) # TODO: cover arbitraty hdim + pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask)) # TODO: cover arbitraty hdim + elif dtype in ['fp8', 'bf8']: + # no need lse/dropout kernels + for logits, mask, bias in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()): + pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, 'f', 'f', squant, mask)) + elif dtype in ['fp8fp16', 'fp8bf16']: + # TODO + None + else: + assert False + return pipelines + + gen = list() + api_pool = FmhaFwdApiPool(mask_impl) + + for dtype in FWD_DTYPE_MAP.keys(): + d = get_fmha_fwd_tile_dict_from_dtype(dtype) + if d == None: + continue + #for hdim_str, mode, mask, bias, lse in itertools.product(d.keys(), MODE_MAP.keys(), MASK_MAP.keys(), ["t", "f"], ["t", "f"]): + for hdim_str, mode in itertools.product(d.keys(), MODE_MAP.keys()): + tile = d[hdim_str] + hdim = int(hdim_str) + for pipeline in get_pipelines(dtype, hdim): + if mode == "group": + if pipeline.F_spad != 't' or pipeline.F_skpad != 't': + # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not + continue + if hdim == 192 and tile.F_bn1 == 128: + # NOTE: this is used to speedup deepseek prefill case, we don't gen training + if pipeline.F_bias != 'no' or pipeline.F_lse == 't' or pipeline.F_dropout == 't': + continue + # logits_soft_cap is only allowed if no bias + if not ((pipeline.F_logits == 't' and pipeline.F_bias == 'no') or pipeline.F_logits == 'f'): + continue + k = FmhaFwdKernel(F_idx=0, + F_hdim=hdim, + F_dtype=dtype, + F_mode=mode, + F_tile=tile, + F_pipeline=pipeline, + mask_impl=mask_impl) + if kernel_filter != '': + if not fnmatch.fnmatch(k.name, kernel_filter): + continue + if optdim_list != [-1]: + if hdim not in 
optdim_list: + continue + # 2 - Flash attention integration + if receipt in (2, 3): + cond = dtype in ['fp16', 'bf16'] + cond &= pipeline.F_vlayout == 'row' + cond &= pipeline.F_bias in ['no', 'alibi'] + cond &= pipeline.F_squant == 'f' + if not cond: + continue + # PyTorch integration + elif receipt == 4: + cond = dtype in ['fp16', 'bf16'] + cond &= pipeline.F_vlayout == 'row' + cond &= pipeline.F_bias in ['no', 'bias'] + cond &= pipeline.F_squant == 'f' + if not cond: + continue + # Aiter(mha_fwd) integration + elif receipt == 100: + cond = dtype in ['fp16', 'bf16'] + cond &= mode == 'batch' + cond &= pipeline.F_vlayout == 'row' + cond &= pipeline.F_squant == 'f' + if not cond: + continue + # Aiter(mha_batch_prefill) integration + elif receipt == 200: + cond = dtype in ['fp16', 'bf16'] + cond &= mode == 'group' + cond &= pipeline.F_vlayout == 'row' + cond &= pipeline.F_squant == 'f' + if not cond: + continue + # aiter::mha_batch_prefill C++ api integration + elif receipt == 600: + cond = dtype in ['fp16', 'bf16'] + cond &= mode == 'group' + cond &= pipeline.F_vlayout == 'row' + cond &= pipeline.F_squant == 'f' + if not cond: + continue + api_pool.register_traits(k.api_trait()) + gen.append(k) + + return (api_pool, gen) + +def write_single_fwd_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None: + (autogen_dir / kernel.filename).write_text(kernel.template) + +def write_fwd_api(api_pool : FmhaFwdApiPool, autogen_dir: Path) -> None: + (autogen_dir / FMHA_FWD_API_FILENAME).write_text(api_pool.api) + +def write_blobs(output_dir : Path, kernel_filter : str, receipt, optdim_list, mask_impl) -> None: + api_pool, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl) + for kernel in kernels: + write_single_fwd_kernel(kernel, output_dir) + write_fwd_api(api_pool, output_dir) + +def list_blobs(file_path : Path, kernel_filter : str, receipt, optdim_list, mask_impl) -> None: + with file_path.open('a') as f: + _, kernels = get_fwd_blobs(kernel_filter, 
receipt, optdim_list, mask_impl) + for kernel in kernels: + f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n") + f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME) + "\n") diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py index 932f6020b6..80b64f918a 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py @@ -60,6 +60,7 @@ using fmha_bwd_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, + false, {F_bias}, {F_dbias}, false, diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index c31a0ce954..2f1287c87a 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -32,6 +32,7 @@ K0_MAX_SUBMAX_MAP = { FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. 
All rights reserved.\n // auto generated by generate.py +#include "ck_tile/ops/fmha/block/variants.hpp" #include "fmha_fwd.hpp" """ @@ -51,12 +52,16 @@ using fmha_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, + {F_logits}, {F_bias}, false, {F_lse}, {F_dropout}, {F_squant}, {F_occupancy}>; + +using fmha_variant_{F_idx} = ck_tile::ComposedAttention<{F_logits} * ck_tile::LOGITS_SOFT_CAP, CK_TILE_FMHA_FWD_FAST_EXP2>; + using fmha_mask_{F_idx} = {F_mask}; using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaPipelineProblem< @@ -73,6 +78,7 @@ using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaPipelineProblem< typename FmhaFwdTypeConfig::ODataType, fmha_shape_{F_idx}, {F_mode}, + fmha_variant_{F_idx}, fmha_mask_{F_idx}, fmha_trait_{F_idx}>; @@ -88,7 +94,7 @@ using fmha_kernel_{F_idx} = ck_tile::FmhaFwdKernel; using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, - {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; + {F_pipeline_enum}, {F_logits}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; #include @@ -123,9 +129,9 @@ FMHA_FWD_API_PER_HDIM_CASE=""" {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v < }} """ -FMHA_FWD_API_INNER_DISPATCH=""" {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse}) && (t.has_dropout == {F_dropout}) && (t.do_fp8_static_quant == {F_squant}) && +FMHA_FWD_API_INNER_DISPATCH=""" {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && (t.has_logits_soft_cap == {F_logits}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse}) && (t.has_dropout == {F_dropout}) && (t.do_fp8_static_quant == {F_squant}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && 
({F_dvcheck})) {{ - using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; + using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; return fmha_fwd_(s, a); }} """ @@ -144,6 +150,7 @@ class FmhaFwdApiTrait: bk1 : int # tile size along kv gemm unroll bk0max : int vlayout : str + logits : str mask : str bias : str # lse : str # @@ -157,7 +164,7 @@ class FmhaFwdApiTrait: @property def name(self) -> str: return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-'+\ - f'{self.vlayout}-{self.mask}-{self.bias}-{self.lse}-{self.dropout}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}' + f'{self.vlayout}-{self.logits}-{self.mask}-{self.bias}-{self.lse}-{self.dropout}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}' @property def scheck(self) -> str: @@ -165,7 +172,7 @@ class FmhaFwdApiTrait: if self.pipeline_tag == 'qr_async': if self.spad == 't' : return 'true' # always support else : return 'true' - elif self.pipeline_tag in ['qr']: + elif self.pipeline_tag in ['qr', 'qs']: if self.spad == 't' : return f'true /*a.seqlen_q % {self.bm0} != 0*/' # TODO: order of get_pipelines() matters! 
(ugly) else : return f'a.seqlen_q % {self.bm0} == 0' else: assert False @@ -176,7 +183,7 @@ class FmhaFwdApiTrait: if self.pipeline_tag == 'qr_async': if self.skpad == 't' : return f'a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0' else : return f'a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0' - elif self.pipeline_tag in ['qr', 'qr_fp8']: + elif self.pipeline_tag in ['qr', 'qs']: if self.skpad == 't' : return f'true /*a.seqlen_k % {self.bn0} != 0*/' # TODO: order of get_pipelines() matters! (ugly) else : return f'a.seqlen_k % {self.bn0} == 0' else: assert False @@ -187,7 +194,7 @@ class FmhaFwdApiTrait: vec = int((32 * 4) / DTYPE_BITS[self.dtype]) if self.dpad == 't': return f'a.hdim_q % {vec} == 0' else : assert False - elif self.pipeline_tag in ['qr']: + elif self.pipeline_tag in ['qr', 'qs']: bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max] if self.dpad == 't': return f'true /*a.hdim_q % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly) else : return f'a.hdim_q % {bk0submax} == 0' @@ -199,7 +206,7 @@ class FmhaFwdApiTrait: vec = int((32 * 4) / DTYPE_BITS[self.dtype]) if self.dvpad == 't': return f'a.hdim_v % {vec} == 0' else : assert False - elif self.pipeline_tag in ['qr']: + elif self.pipeline_tag in ['qr', 'qs']: bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max] if self.dvpad == 't': return f'true /*a.hdim_v % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! 
(ugly) else : return f'a.hdim_v % {bk0submax} == 0' @@ -214,6 +221,7 @@ class FmhaFwdPipeline: F_skpad : str # F_dpad : str # F_dvpad : str # + F_logits : str # t/f F_bias : str # true/false F_lse : str # F_dropout : str # @@ -235,6 +243,9 @@ class FmhaFwdPipeline: if pn != '' : n += f'_{pn}' else: n += '_npad' + if self.F_logits == 't' : n += '_logits' + else: n += '_nlogits' + if self.F_bias != 'no' : n += f'_{self.F_bias}' else: n += '_nbias' @@ -280,7 +291,7 @@ class FmhaFwdApiPool: for k, trait in enumerate(traits): if_k = 'if' if k == 0 else 'else if' inners = inners + FMHA_FWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_vlayout=LAYOUT_MAP[trait.vlayout], - F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_mask=get_mask_map(self.mask_impl)[trait.mask], + F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_logits=BOOL_MAP[trait.logits], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias], F_lse=BOOL_MAP[trait.lse], F_dropout=BOOL_MAP[trait.dropout] , F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, @@ -365,6 +376,7 @@ class FmhaFwdKernel: F_skpad = BOOL_MAP[self.F_pipeline.F_skpad], F_dpad = BOOL_MAP[self.F_pipeline.F_dpad], F_dvpad = BOOL_MAP[self.F_pipeline.F_dvpad], + F_logits = BOOL_MAP[self.F_pipeline.F_logits], F_bias = BIAS_MAP[self.F_pipeline.F_bias], F_lse = BOOL_MAP[self.F_pipeline.F_lse], F_dropout = BOOL_MAP[self.F_pipeline.F_dropout], @@ -399,6 +411,7 @@ class FmhaFwdKernel: bk0max=self.F_tile.F_bk0max, vlayout=self.F_pipeline.F_vlayout, mask=self.F_pipeline.F_mask, + logits=self.F_pipeline.F_logits, bias=self.F_pipeline.F_bias, lse=self.F_pipeline.F_lse, dropout=self.F_pipeline.F_dropout, @@ -440,36 +453,36 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl squant = 't' if 
dtype == 'fp8' else 'f' pipelines = [] if dtype in ['fp16', 'bf16']: - for mask, bias, lse, dropout in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"]): + for logits, mask, bias, lse, dropout in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"]): if hdim == 256: # if True: - pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', bias, lse, dropout, squant, mask)) - pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask)) # the below two is used for hdim vectorize load - pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 'f', 'f', bias, lse, dropout, squant, mask)) - pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 'f', 'f', bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 'f', 'f', logits, bias, lse, dropout, squant, mask)) - pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', bias, lse, dropout, squant, mask)) - pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) else: if bias == "bias": # TODO: rocm 6.2 compiler problem if using qr_async for bias case - pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', bias, lse, dropout, squant, mask)) - pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', bias, lse, dropout, squant, mask)) - 
pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, lse, dropout, squant, mask)) - pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) else: - pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', bias, lse, dropout, squant, mask)) - pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', bias, lse, dropout, squant, mask)) - pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 'f', 't', 't', bias, lse, dropout, squant, mask)) - pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) if receipt == 1 and bias != "bias": - pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', bias, lse, dropout, squant, mask)) # TODO: cover arbitraty hdim - pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 'f', 't', 't', bias, lse, dropout, squant, mask)) # TODO: cover arbitraty hdim + pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) # TODO: cover arbitraty hdim + 
pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask)) # TODO: cover arbitraty hdim elif dtype in ['fp8', 'bf8']: # no need lse/dropout kernels - for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()): - pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 'f', 'f', squant, mask)) + for logits, mask, bias in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()): + pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, 'f', 'f', squant, mask)) elif dtype in ['fp8fp16', 'fp8bf16']: # TODO None @@ -497,6 +510,9 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl # NOTE: this is used to speedup deepseek prefill case, we don't gen training if pipeline.F_bias != 'no' or pipeline.F_lse == 't' or pipeline.F_dropout == 't': continue + # logits_soft_cap is only allowed if no bias + if not ((pipeline.F_logits == 't' and pipeline.F_bias == 'no') or pipeline.F_logits == 'f'): + continue k = FmhaFwdKernel(F_idx=0, F_hdim=hdim, F_dtype=dtype, diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index 5ad118fd1a..3ae0e28be3 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -45,6 +45,7 @@ FMHA_FWD_SPLITKV_PIPELINE_MAP = { FMHA_FWD_SPLITKV_KERNEL_BODY=""" using fmha_dtype_{F_idx} = {F_dtype}; +using fmha_variant_{F_idx} = ck_tile::ComposedAttention<{F_logits} * ck_tile::LOGITS_SOFT_CAP, CK_TILE_FMHA_FWD_FAST_EXP2>; using fmha_mask_{F_idx} = {F_mask}; namespace {{ @@ -63,6 +64,7 @@ using fmha_trait = ck_tile::TileFmhaFwdSplitKVTraits<{F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, + {F_logits}, {F_bias}, /*kHasBiasGrad=*/false, {F_lse}, @@ -85,6 +87,7 @@ using fmha_pipeline_problem = ck_tile::BlockFmhaFwdSplitKVPipelineProblem< typename 
FmhaFwdTypeConfig::OaccDataType, fmha_shape, {F_mode}, + fmha_variant_{F_idx}, fmha_mask_{F_idx}, fmha_trait>; @@ -113,7 +116,7 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a) }} using trait_{F_idx} = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, - {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, + {F_pipeline_enum}, {F_logits}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; #include @@ -267,9 +270,9 @@ float fmha_fwd_splitkv(fmha_fwd_splitkv_traits t, fmha_fwd_splitkv_args a, const }} """ -FMHA_FWD_SPLITKV_API_INNER_DISPATCH=""" {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.do_fp8_static_quant == {F_squant}) && +FMHA_FWD_SPLITKV_API_INNER_DISPATCH=""" {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && (t.has_logits_soft_cap == {F_logits}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.do_fp8_static_quant == {F_squant}) && ((a.block_table_ptr != nullptr) == {F_pagedkv}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{ - using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, true, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; + using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, true, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; // get combine kernel tile sizes using OaccDataType = typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType; @@ -310,6 +313,7 @@ class FmhaFwdSplitKVApiTrait: 
bk0max : int vlayout : str mask : str + logits : str bias : str # lse : str # squant : str # @@ -322,7 +326,7 @@ class FmhaFwdSplitKVApiTrait: @property def name(self) -> str: return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-'+\ - f'{self.vlayout}-{self.mask}-{self.bias}-{self.lse}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-'+\ + f'{self.vlayout}-{self.logits}-{self.mask}-{self.bias}-{self.lse}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-'+\ f'{self.dvpad}-{self.pagedkv}' @property @@ -380,6 +384,7 @@ class FmhaFwdSplitKVPipeline: F_skpad : str # F_dpad : str # F_dvpad : str # + F_logits : str # t/f F_bias : str # true/false F_lse : str # F_squant : str # @@ -401,6 +406,9 @@ class FmhaFwdSplitKVPipeline: if pn != '' : n += f'_{pn}' else: n += '_npad' + if self.F_logits == 't' : n += '_logits' + else: n += '_nlogits' + if self.F_bias != 'no' : n += f'_{self.F_bias}' else: n += '_nbias' @@ -475,7 +483,7 @@ class FmhaFwdSplitKVApiPool: for k, trait in enumerate(traits): if_k = 'if' if k == 0 else 'else if' inners = inners + FMHA_FWD_SPLITKV_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_vlayout=LAYOUT_MAP[trait.vlayout], - F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_mask=get_mask_map(self.mask_impl)[trait.mask], + F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_logits=BOOL_MAP[trait.logits], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias], F_lse=BOOL_MAP[trait.lse], F_squant=BOOL_MAP[trait.squant], F_pagedkv=BOOL_MAP[trait.pagedkv], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, @@ -541,6 +549,7 @@ class FmhaFwdSplitKVKernel: F_skpad = BOOL_MAP[self.F_pipeline.F_skpad], F_dpad = BOOL_MAP[self.F_pipeline.F_dpad], F_dvpad = BOOL_MAP[self.F_pipeline.F_dvpad], + F_logits = 
BOOL_MAP[self.F_pipeline.F_logits], F_bias = BIAS_MAP[self.F_pipeline.F_bias], F_lse = BOOL_MAP[self.F_pipeline.F_lse], F_squant = BOOL_MAP[self.F_pipeline.F_squant], @@ -574,6 +583,7 @@ class FmhaFwdSplitKVKernel: bk1=self.F_tile.F_bk1, bk0max=self.F_tile.F_bk0max, vlayout=self.F_pipeline.F_vlayout, + logits=self.F_pipeline.F_logits, mask=self.F_pipeline.F_mask, bias=self.F_pipeline.F_bias, lse=self.F_pipeline.F_lse, @@ -671,32 +681,32 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> squant = 't' if dtype == 'fp8' else 'f' pipelines = [] if dtype in ['fp16', 'bf16']: - for mask, bias, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]): + for logits, mask, bias, pagedkv in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]): # TODO: use async pipeline when compiler is more stable if hdim == 256 or hdim in [32, 64, 128]: ### [32, 64, 96, 128]: # if True: - pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'row', 't', 'f', 'f', 'f', bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'col', 't', 'f', 'f', 'f', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'row', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'row', 't', 't', 'f', 'f', bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'col', 't', 't', 'f', 'f', bias, 't', squant, pagedkv, mask)) + 
pipelines.append(Pipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) else: - pipelines.append(Pipeline('qr_async', 'row', 't', 'f', 't', 't', bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr_async', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr_async', 'col', 't', 'f', 't', 't', bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr_async', 'col', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) if receipt == 1: - pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim - pipelines.append(Pipeline('qr', 'col', 't', 'f', 't', 't', bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim + pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim + pipelines.append(Pipeline('qr', 'col', 't', 'f', 't', 't', logits, bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim elif 
dtype in ['fp8', 'bf8']: - for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()): - pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 't', squant, 'f', mask)) + for logits, mask, bias in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()): + pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, 't', squant, 'f', mask)) elif dtype in ['fp8fp16', 'fp8bf16']: # TODO None @@ -720,6 +730,9 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> if pipeline.F_spad != 't' or pipeline.F_skpad != 't': # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not continue + # logits_soft_cap is only allowed if no bias + if not ((pipeline.F_logits == 't' and pipeline.F_bias == 'no') or pipeline.F_logits == 'f'): + continue k = Kernel(F_idx=0, F_hdim=hdim, F_dtype=dtype, diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp index 8f6fb8df54..bb1f495c4e 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.cpp +++ b/example/ck_tile/01_fmha/fmha_fwd.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -72,6 +73,7 @@ auto create_args(int argc, char* argv[]) "0", "scale factor of S. 0 means equal to 1/sqrt(hdim).\n" "note when squant=1, this value will be modified by range_q/k") + .insert("logits_soft_cap", "0", "attention logits soft capping value.") .insert("range_q", "16", "per-tensor quantization range of q. used if squant=1.") .insert("range_k", "16", "per-tensor quantization range of k. used if squant=1.") .insert("range_v", "16", "per-tensor quantization range of v. used if squant=1.") @@ -416,6 +418,8 @@ bool run(const ck_tile::ArgParser& arg_parser) if(scale_s == .0f) scale_s = 1.0 / ck_tile::sqrt(static_cast(hdim_q)); // TODO: q ? v ? 
+ const float logits_soft_cap = arg_parser.get_float("logits_soft_cap"); + std::string squant_str = arg_parser.get_str("squant"); bool squant = [&]() { if(squant_str == "auto") @@ -850,6 +854,7 @@ bool run(const ck_tile::ArgParser& arg_parser) else // fmha_fwd_traits or fmha_splitkv_traits { traits.is_group_mode = (mode == mode_enum::group); + traits.has_logits_soft_cap = 0.f < logits_soft_cap; traits.mask_type = mask.type; traits.bias_type = bias.type; traits.has_lse = lse; @@ -1007,6 +1012,8 @@ bool run(const ck_tile::ArgParser& arg_parser) args.scale_p = scale_p; args.scale_o = scale_o; + args.logits_soft_cap = logits_soft_cap; + args.stride_bias = (bias.type == bias_enum::alibi ? (bias.rank_info == 0 ? 0 : nhead) : stride_bias); args.stride_o = stride_o; @@ -1375,6 +1382,16 @@ bool run(const ck_tile::ArgParser& arg_parser) ck_tile::identity{}, ck_tile::scales(scale_s)); + if(0.f < logits_soft_cap) + { + ck_tile::reference_unary_elementwise( + s_host_ref, s_host_ref, [logits_soft_cap](SaccDataType logits) { + return ck_tile::type_convert( + logits_soft_cap * + std::tanhf(ck_tile::type_convert(logits / logits_soft_cap))); + }); + } + if(bias.type == bias_enum::elementwise_bias) { // elementwise bias diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp index 765c221a7b..1838ee5bd9 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.hpp +++ b/example/ck_tile/01_fmha/fmha_fwd.hpp @@ -143,6 +143,8 @@ struct fmha_fwd_args float scale_p; float scale_o; + float logits_soft_cap; + ck_tile::index_t stride_q; ck_tile::index_t stride_k; ck_tile::index_t stride_v; @@ -232,6 +234,8 @@ struct fmha_fwd_splitkv_args float scale_p; float scale_o; + float logits_soft_cap; + ck_tile::index_t stride_q; ck_tile::index_t stride_k; ck_tile::index_t stride_v; @@ -308,6 +312,85 @@ struct fmha_fwd_appendkv_args ck_tile::index_t batch_stride_vnew; }; +struct fmha_batch_prefill_args +{ + const void* q_ptr; + const void* k_ptr; + const void* v_ptr; + const void* 
bias_ptr; // bias or alibi_slope pointer + void* rand_val_ptr; + void* lse_ptr; + void* o_ptr; + + // the real seqlen_q & seqlen_k are decided by following: + // batch mode (kvcache): + // seqlen_q = kargs.seqlen_q + // seqlen_k = kargs.page_block_size * (kargs.kv_indptr[b + 1] - kargs.kv_indptr[b] - + // 1) + + // kargs.kv_last_page_lens[b] + // group mode (kvcache): + // seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b] + // seqlen_k = kargs.page_block_size * (kargs.kv_indptr[b + 1] - kargs.kv_indptr[b] - + // 1) + + // kargs.kv_last_page_lens[b] + const void* seqstart_q_ptr; + + ck_tile::index_t seqlen_q; + ck_tile::index_t seqlen_k; + ck_tile::index_t batch; + ck_tile::index_t max_seqlen_q; + ck_tile::index_t hdim_q; + ck_tile::index_t hdim_v; + ck_tile::index_t nhead_q; + ck_tile::index_t nhead_k; + + // SGLang-style page table + int32_t num_total_pages; + void* kv_indptr; + void* kv_page_indices; +#if 0 // we assume page_block_size=1 for now + void* kv_last_page_lens; + ck_tile::index_t page_block_size; +#endif + + float scale_s; + float scale_p; + float scale_o; + + float logits_soft_cap; + + ck_tile::index_t stride_q; + ck_tile::index_t stride_k; + ck_tile::index_t stride_v; + ck_tile::index_t stride_bias; // if alibi, b*h need set this to h, 1*h need set this to 0 + ck_tile::index_t stride_randval; + ck_tile::index_t stride_o; + ck_tile::index_t nhead_stride_q; + ck_tile::index_t nhead_stride_k; + ck_tile::index_t nhead_stride_v; + ck_tile::index_t nhead_stride_bias; + ck_tile::index_t nhead_stride_randval; + ck_tile::index_t nhead_stride_lse; + ck_tile::index_t nhead_stride_o; + ck_tile::index_t batch_stride_q; + ck_tile::index_t batch_stride_k; + ck_tile::index_t batch_stride_v; + ck_tile::index_t batch_stride_bias; + ck_tile::index_t batch_stride_randval; + ck_tile::index_t batch_stride_lse; + ck_tile::index_t batch_stride_o; + + ck_tile::index_t window_size_left; + ck_tile::index_t window_size_right; + ck_tile::index_t mask_type; + + 
float p_drop; + bool s_randval; + + std::variant, std::pair> + drop_seed_offset; +}; + template auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args) { @@ -333,6 +416,7 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args) args.scale_s, args.scale_p, args.scale_o, + args.logits_soft_cap, args.stride_q, args.stride_k, args.stride_v, @@ -371,6 +455,7 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args) args.scale_s, args.scale_p, args.scale_o, + args.logits_soft_cap, args.stride_q, args.stride_k, args.stride_v, @@ -443,6 +528,7 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args) args.is_gappy, args.scale_s, args.scale_p, + args.logits_soft_cap, args.stride_q, args.stride_k, args.stride_v, @@ -485,6 +571,7 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args) args.cache_batch_idx, args.scale_s, args.scale_p, + args.logits_soft_cap, args.stride_q, args.stride_k, args.stride_v, @@ -618,6 +705,117 @@ auto fmha_fwd_appendkv_create_kargs_and_grids(fmha_fwd_appendkv_args args) return ck_tile::make_tuple(kargs, grids); } +template +auto fmha_batch_prefill_create_kargs_and_grids(fmha_batch_prefill_args args) +{ + assert(args.nhead_q % args.nhead_k == 0); + auto kargs = [&] { + // create group mode kernel arguments + if constexpr(FmhaKernel::kIsGroupMode) + { + return FmhaKernel::MakeKargsImpl(args.q_ptr, + args.k_ptr, + args.v_ptr, + args.bias_ptr, + args.rand_val_ptr, + args.lse_ptr, + args.o_ptr, + args.seqstart_q_ptr, + args.hdim_q, + args.hdim_v, + args.nhead_q, + args.nhead_q / args.nhead_k, + args.num_total_pages, + args.kv_indptr, + args.kv_page_indices, +#if 0 // we assume page_block_size=1 for now + args.kv_last_page_lens, + args.page_block_size, +#endif + args.scale_s, + args.scale_p, + args.scale_o, + args.logits_soft_cap, + args.stride_q, + args.stride_k, + args.stride_v, + args.stride_bias, + args.stride_randval, + args.stride_o, + args.nhead_stride_q, + args.nhead_stride_k, + args.nhead_stride_v, + 
args.nhead_stride_bias, + args.nhead_stride_randval, + args.nhead_stride_lse, + args.nhead_stride_o, + args.batch_stride_k, + args.batch_stride_v, + args.window_size_left, + args.window_size_right, + args.mask_type, + args.p_drop, + args.s_randval, + args.drop_seed_offset); + } + else + { // create batch mode kernel arguments + return FmhaKernel::MakeKargsImpl(args.q_ptr, + args.k_ptr, + args.v_ptr, + args.bias_ptr, + args.rand_val_ptr, + args.lse_ptr, + args.o_ptr, + args.seqlen_q, + args.hdim_q, + args.hdim_v, + args.nhead_q, + args.nhead_q / args.nhead_k, + args.num_total_pages, + args.kv_indptr, + args.kv_page_indices, +#if 0 // we assume page_block_size=1 for now + args.kv_last_page_lens, + args.page_block_size, +#endif + args.scale_s, + args.scale_p, + args.scale_o, + args.logits_soft_cap, + args.stride_q, + args.stride_k, + args.stride_v, + args.stride_bias, + args.stride_randval, + args.stride_o, + args.nhead_stride_q, + args.nhead_stride_k, + args.nhead_stride_v, + args.nhead_stride_bias, + args.nhead_stride_randval, + args.nhead_stride_lse, + args.nhead_stride_o, + args.batch_stride_q, + args.batch_stride_k, + args.batch_stride_v, + args.batch_stride_bias, + args.batch_stride_randval, + args.batch_stride_lse, + args.batch_stride_o, + args.window_size_left, + args.window_size_right, + args.mask_type, + args.p_drop, + args.s_randval, + args.drop_seed_offset); + } + }(); + + dim3 grids = FmhaKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v); + return ck_tile::make_tuple(kargs, grids); +} + // this is used to pattern-match internl kernel implementation, not to instantiate kernel template ; static constexpr auto BiasEnum = BiasEnum_; static constexpr bool kStoreLse = kStoreLse_; @@ -677,6 +877,7 @@ template ; static constexpr auto BiasEnum = BiasEnum_; static constexpr bool kStoreLse = kStoreLse_; @@ -776,6 +978,9 @@ struct fmha_fwd_appendkv_traits_ template float fmha_fwd_appendkv_(const ck_tile::stream_config&, 
fmha_fwd_appendkv_args); +template +float fmha_batch_prefill_(const ck_tile::stream_config&, fmha_batch_prefill_args); + // This is the public API, will be generated by script struct fmha_fwd_traits { @@ -784,6 +989,7 @@ struct fmha_fwd_traits std::string data_type; bool is_group_mode; bool is_v_rowmajor; + bool has_logits_soft_cap; mask_enum mask_type; bias_enum bias_type; // 0:no bias, 1:elementwise bias, 2:alibi. sync with BlockAttentionBiasEnum bool has_lse; @@ -800,6 +1006,7 @@ struct fmha_fwd_splitkv_traits std::string data_type; bool is_group_mode; bool is_v_rowmajor; + bool has_logits_soft_cap; mask_enum mask_type; bias_enum bias_type; // 0:no bias, 1:elementwise bias, 2:alibi. sync with BlockAttentionBiasEnum bool has_lse; @@ -821,3 +1028,8 @@ struct fmha_fwd_appendkv_traits float fmha_fwd_appendkv(fmha_fwd_appendkv_traits, fmha_fwd_appendkv_args, const ck_tile::stream_config&); + +using fmha_batch_prefill_traits = fmha_fwd_traits; +float fmha_batch_prefill(fmha_batch_prefill_traits, + fmha_batch_prefill_args, + const ck_tile::stream_config&); diff --git a/example/ck_tile/01_fmha/generate.py b/example/ck_tile/01_fmha/generate.py index c2b0924eb3..c611618824 100644 --- a/example/ck_tile/01_fmha/generate.py +++ b/example/ck_tile/01_fmha/generate.py @@ -21,8 +21,7 @@ class HandlerId(IntEnum): ops = [] for importer, module_name, _ in pkgutil.iter_modules(codegen.ops.__path__): full_module_name = '%s.%s' % (codegen.ops.__name__, module_name) - if full_module_name not in sys.modules: - ops.append(importer.find_spec(module_name).loader.load_module(module_name)) + ops.append(importer.find_spec(module_name).loader.load_module(module_name)) unwanted_prefix = 'fmha_' handlers = dict( [(op.__name__[len(unwanted_prefix):] if op.__name__.startswith(unwanted_prefix) else op.__name__, diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index b94157eaec..b9791f0b55 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -54,6 +54,7 @@ #include 
"ck_tile/core/tensor/tile_distribution.hpp" #include "ck_tile/core/tensor/tile_distribution_encoding.hpp" #include "ck_tile/core/tensor/tile_elementwise.hpp" +#include "ck_tile/core/tensor/tile_scatter_gather.hpp" #include "ck_tile/core/tensor/tile_window.hpp" #include "ck_tile/core/tensor/tile_window_linear.hpp" #include "ck_tile/core/tensor/tile_window_utils.hpp" diff --git a/include/ck_tile/core/numeric/math.hpp b/include/ck_tile/core/numeric/math.hpp index 6bdcb509b0..8176fe551c 100644 --- a/include/ck_tile/core/numeric/math.hpp +++ b/include/ck_tile/core/numeric/math.hpp @@ -487,6 +487,9 @@ struct log2e template constexpr T log2e_v = log2e::value; +template +constexpr T log2e_rcp_v = 1. / log2e::value; + CK_TILE_DEVICE float exp2(float x) { return exp2f(x); }; @@ -1380,6 +1383,44 @@ CK_TILE_DEVICE double exp(double x) return exp(x); }; +template +CK_TILE_DEVICE T tanh_fast(T x) +{ + return type_convert((exp(2.0 * type_convert(x)) - 1.0) / + (exp(2.0 * type_convert(x)) + 1.0)); +}; + +template <> +CK_TILE_DEVICE float tanh_fast(float x) +{ + // float a = __builtin_amdgcn_sinh(x); + // float b = __builtin_amdgcn_cosh(x); + // float e = a * __builtin_amdgcn_rcpf(b); + // return e; + + float a = 2.0f * log2e_v * x; + a = __builtin_amdgcn_exp2f(a); + a = __builtin_amdgcn_rcpf(a + 1.0f); + a = 2 * a; + a = 1 - a; + return a; + + // float e, r, s, t, d; + // float a = x; + // s = abs(a); + // t = -log2e_v * 2.0f * s; + // e = __builtin_amdgcn_exp2f(t); + // d = e + 1.0f; + // r = __builtin_amdgcn_rcpf(d); + // r = e * (-r) + r; + // if (s < 4.997253418e-3f) r = a; + // union fipnr {float f; unsigned int i;}; + // fipnr r_; r_.f = r; + // fipnr a_; a_.f = a; + // { r_.i = (r_.i|(a_.i&0x80000000)); r = r_.f; } + // return r; +}; + template CK_TILE_DEVICE T log(T x) { diff --git a/include/ck_tile/core/tensor/load_tile.hpp b/include/ck_tile/core/tensor/load_tile.hpp index b280a1725d..4601261197 100644 --- a/include/ck_tile/core/tensor/load_tile.hpp +++ 
b/include/ck_tile/core/tensor/load_tile.hpp @@ -18,32 +18,8 @@ namespace ck_tile { -template -CK_TILE_DEVICE auto load_tile(const tile_window_with_static_distribution& tile_window, - number = {}, - bool_constant = {}) -{ - return tile_window.load(number{}, bool_constant{}); -} - -template -CK_TILE_DEVICE auto load_tile(const tile_window_linear& tile_window, +template +CK_TILE_DEVICE auto load_tile(const TileWindow_& tile_window, number = {}, bool_constant = {}) { @@ -51,35 +27,11 @@ CK_TILE_DEVICE auto load_tile(const tile_window_linear CK_TILE_DEVICE auto load_tile(DistributedTensor_& dst_tile, - const tile_window_with_static_distribution& tile_window, - number = {}, - bool_constant = {}) -{ - return tile_window.load(dst_tile, number{}, bool_constant{}); -} - -template -CK_TILE_DEVICE auto load_tile(DistributedTensor_& dst_tile, - const tile_window_linear& tile_window, + const TileWindow_& tile_window, number = {}, bool_constant = {}) { @@ -138,42 +90,12 @@ CK_TILE_DEVICE auto load_tile_raw(T& tile, } template -CK_TILE_DEVICE auto -async_load_tile_raw(LdsTileWindow_&& lds_tile, - const tile_window_with_static_distribution& tile_window, - number = {}, - bool_constant = {}, - bool_constant = {}) -{ - return tile_window.async_load_raw(lds_tile, - number{}, - bool_constant{}, - bool_constant{}); -} - -template CK_TILE_DEVICE auto async_load_tile_raw(LdsTileWindow_&& lds_tile, - const tile_window_linear& tile_window, + const TileWindow_& tile_window, number = {}, bool_constant = {}, bool_constant = {}) diff --git a/include/ck_tile/core/tensor/tensor_view.hpp b/include/ck_tile/core/tensor/tensor_view.hpp index 29db5e1fca..656ce8d20d 100644 --- a/include/ck_tile/core/tensor/tensor_view.hpp +++ b/include/ck_tile/core/tensor/tensor_view.hpp @@ -210,6 +210,27 @@ struct tensor_view bool_constant{}); } + template >::scalar_type, + typename vector_traits>::scalar_type>, + bool>::type = false> + CK_TILE_HOST_DEVICE constexpr void + 
async_get_vectorized_elements_raw(remove_cvref_t* smem, + const TensorCoord& coord, + index_t coord_extra_offset, + index_t linear_offset, + bool_constant = {}) const + { + return buf_.template async_get_raw( + smem, + (coord.get_offset() + coord_extra_offset) / PackedSize, + linear_offset / PackedSize, + coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord), + bool_constant{}); + } + template +struct tile_scatter_gather +{ + using BottomTensorView = remove_reference_t; + using WindowLengths = remove_cvref_t; + using TileDstr = remove_cvref_t; + using PageIdxArray = remove_cvref_t; + using WindowAdaptor = typename TileDstr::PsYs2XsAdaptor; + using BottomTensorDesc = typename BottomTensorView::TensorDesc; + + using DataType = remove_cvref_t; + + static constexpr index_t NDimWindowAdaptorTop = WindowAdaptor::get_num_of_top_dimension(); + static constexpr index_t NDimBottomTensor = BottomTensorDesc::get_num_of_dimension(); + + static constexpr index_t NDimP = TileDstr::get_num_of_dimension_p(); + static constexpr index_t NDimY = TileDstr::get_num_of_dimension_y(); + + static constexpr auto I0 = number<0>{}; + static constexpr auto I1 = number<1>{}; + static_assert(NumCoord == 1); + + // TODO: check WindowLengths and StaticTileDistribution are consistent + + static_assert(ck_tile::is_known_at_compile_time::value, + "wrong! lengths should be static"); + static_assert(TileDstr::is_static(), "wrong!"); + + static_assert(NDimBottomTensor == WindowAdaptor::get_num_of_bottom_dimension(), + "wrong! 
inconsistent # of dimensions"); + + using AdaptorTopIndex = array; + using BottomTensorIndex = array; + + using WindowAdaptorCoord = + decltype(make_tensor_adaptor_coordinate(WindowAdaptor{}, AdaptorTopIndex{})); + + using BottomTensorCoord = + decltype(make_tensor_coordinate(BottomTensorDesc{}, BottomTensorIndex{})); + + struct load_store_traits + { + private: + static constexpr auto get_vector_dim_y_scalar_per_vector() + { + const auto [ys_vector_lengths, ys_vector_strides] = + tile_scatter_gather::get_window_adaptor_ys_safe_vector_length_strides(); + + index_t VectorDimY_ = 0; + index_t ScalarPerVector_ = 1; + + for(index_t i = 0; i < NDimY; ++i) + { + if(ys_vector_strides[i] == 1 && ys_vector_lengths[i] > ScalarPerVector_) + { + ScalarPerVector_ = ys_vector_lengths[i]; + VectorDimY_ = i; + } + } + + return make_tuple(VectorDimY_, ScalarPerVector_); + } + + public: + static constexpr index_t PackedSize = + ck_tile::numeric_traits>::PackedSize; + static constexpr index_t VectorDimY = get_vector_dim_y_scalar_per_vector().template at<0>(); + static constexpr index_t ScalarPerVector = + get_vector_dim_y_scalar_per_vector().template at<1>(); + + // using vector_type_t = vector_type_maker_t; + // using vector_t = typename vector_type_t::type; + using vector_t = thread_buffer; + + private: + static constexpr auto scalars_per_access_ = [] { + constexpr auto scalars_per_access_arr = generate_array( + [&](auto i) { return (i == VectorDimY) ? 
ScalarPerVector : 1; }, number{}); + + /// TODO: add non-automatic storage argument support to macro TO_SEQUENCE() + constexpr auto NDimY_ = NDimY; + + return TO_SEQUENCE(scalars_per_access_arr, NDimY_); + }(); + + static constexpr auto get_space_filling_curve() + { + constexpr auto tile_dstr = TileDstr{}; + + constexpr auto thread_tensor_lengths_ys = + to_sequence(tile_dstr.get_ys_to_d_descriptor().get_lengths()); + + // FIXME: need logic to judge dim access order + using DimAccessOrder = typename arithmetic_sequence_gen<0, NDimY, 1>::type; + + return space_filling_curve{}; + } + + public: + using SFC_Ys = decltype(get_space_filling_curve()); + + static constexpr index_t NumAccess = SFC_Ys::get_num_of_access(); + + static_assert(0 < NumAccess, "Wrong! NumAccess should be larger than 0"); + static_assert(NumAccess % NumCoord == 0, "wrong! # of access is not divisible by NumCoord"); + }; + + static constexpr index_t NumAccessPerCoord = load_store_traits::NumAccess / NumCoord; + + CK_TILE_DEVICE constexpr tile_scatter_gather() = default; + + CK_TILE_DEVICE constexpr tile_scatter_gather(const BottomTensorView& bottom_tensor_view, + const WindowLengths& window_lengths, + const BottomTensorIndex& window_origin, + const TileDstr& tile_distribution, + const PageIdxArray& page_idx) + : bottom_tensor_view_{bottom_tensor_view}, + window_lengths_{window_lengths}, + window_origin_{window_origin}, + tile_dstr_{tile_distribution}, + page_idx_{page_idx}, + pre_computed_coords_{} + { +#if 0 // debug + // TODO: this use more register for FA, but less register for GEMM + // need investigation + // only support warp-tile and block-tile + static_assert(NDimP == 1 or NDimP == 2, "wrong!"); + + WindowAdaptorCoord window_adaptor_thread_coord_tmp; + + if constexpr(NDimP == 1) + { + window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate( + tile_distribution.get_ps_ys_to_xs_adaptor(), AdaptorTopIndex{get_lane_id(), 0}); + } + else if constexpr(NDimP == 2) + { + 
window_adaptor_thread_coord_tmp = + make_tensor_adaptor_coordinate(tile_distribution.get_ps_ys_to_xs_adaptor(), + AdaptorTopIndex{get_warp_id(), get_lane_id(), 0}); + } +#else + // TODO: this use less register for FA, but more register for GEMM + // need investigation + const auto window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate( + tile_distribution.get_ps_ys_to_xs_adaptor(), + container_concat(detail::get_partition_index(tile_distribution), + array{0})); +#endif + + BottomTensorIndex bottom_tensor_thread_origin_idx_tmp = + window_origin + window_adaptor_thread_coord_tmp.get_bottom_index(); + bottom_tensor_thread_origin_idx_tmp(HsGatherDim) = 0; + const auto bottom_tensor_thread_coord_tmp = make_tensor_coordinate( + bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_origin_idx_tmp); + + // pre-compute NumCoord (WindowAdaptorCoord, BottomTensorCoord) bundles to speed up + // future load/store() calls (might allocate more registers) + using Traits = load_store_traits; + using SFC_Ys = typename Traits::SFC_Ys; + + static_for<0, NumCoord, 1>{}([&](auto iCoord) { + auto window_adaptor_thread_coord = window_adaptor_thread_coord_tmp; + auto bottom_tensor_thread_coord = bottom_tensor_thread_coord_tmp; + + constexpr auto idx_diff_ys = + SFC_Ys::get_step_between(number<0>{}, number{}); + + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), idx_diff_ys); + + move_window_adaptor_and_bottom_tensor_thread_coordinate( + window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys); + + pre_computed_coords_(iCoord) = + make_tuple(window_adaptor_thread_coord, bottom_tensor_thread_coord); + }); + } + + CK_TILE_DEVICE static constexpr index_t get_num_of_dimension() { return NDimBottomTensor; } + + CK_TILE_DEVICE static constexpr bool has_static_tile_distribution() + { + return TileDstr::is_static(); + } + + CK_TILE_DEVICE constexpr auto get_window_lengths() const { return 
window_lengths_; } + + CK_TILE_DEVICE constexpr auto get_tile_distribution() const { return tile_dstr_; } + + CK_TILE_DEVICE constexpr auto get_bottom_tensor_view() const { return bottom_tensor_view_; } + + CK_TILE_DEVICE constexpr auto get_window_origin() const { return window_origin_; } + + CK_TILE_DEVICE constexpr void + set_bottom_tensor_view_data_ptr(typename BottomTensorView::DataType* data) + { + bottom_tensor_view_.buf_.p_data_ = data; + } + + // move thread's window adaptor coordinate and bottom tensor coordinate + // [p0, p1, ..., y0, y1, ...] ==> [x0, x1, ...] ==> [x0', x1', ...] ==> [offset] + template + CK_TILE_DEVICE void move_window_adaptor_and_bottom_tensor_thread_coordinate( + WindowAdaptorCoord& window_adaptor_thread_coord, + BottomTensorCoord& bottom_tensor_thread_coord, + const ATopIndex& idx_diff_adaptor_top) const + { + array idx_diff_adaptor_bottom; + + move_tensor_adaptor_coordinate(tile_dstr_.get_ps_ys_to_xs_adaptor(), + window_adaptor_thread_coord, + idx_diff_adaptor_top, + idx_diff_adaptor_bottom); + + move_tensor_coordinate(bottom_tensor_view_.get_tensor_descriptor(), + bottom_tensor_thread_coord, + idx_diff_adaptor_bottom); + } + + // return vector dimension among [y0, y1, ...] + CK_TILE_DEVICE static constexpr auto get_window_adaptor_ys_safe_vector_length_strides() + { + // bottom tensor top dimension vector lengths and strides + const auto [bottom_tensor_top_dim_vector_lengths, bottom_tensor_top_dim_vector_strides] = + BottomTensorDesc::get_top_dimension_safe_vector_length_strides(); + + // window vector lengths/strides + const auto window_adaptor_bottom_dim_vector_lengths = bottom_tensor_top_dim_vector_lengths; + const auto window_adaptor_bottom_dim_vector_strides = bottom_tensor_top_dim_vector_strides; + + // window adaptor [p0, p1, ..., y0, y1, ...] 
+ array window_adaptor_vector_lengths{ + -1}; + array window_adaptor_vector_strides{ + -1}; + + constexpr auto window_adaptor_bottom_dims = + WindowAdaptor::get_bottom_dimension_hidden_ids(); + + set_container_subset(window_adaptor_vector_lengths, + window_adaptor_bottom_dims, + window_adaptor_bottom_dim_vector_lengths); + set_container_subset(window_adaptor_vector_strides, + window_adaptor_bottom_dims, + window_adaptor_bottom_dim_vector_strides); + + const auto [window_adaptor_ps_ys_vector_lengths, window_adaptor_ps_ys_vector_strides] = + WindowAdaptor{}.get_top_dimension_safe_vector_length_strides( + window_adaptor_vector_lengths, window_adaptor_vector_strides); + + // [y0, y1, ...] + constexpr auto y_dims = typename arithmetic_sequence_gen::type{}; + + return make_tuple(get_container_subset(window_adaptor_ps_ys_vector_lengths, y_dims), + get_container_subset(window_adaptor_ps_ys_vector_strides, y_dims)); + } + + CK_TILE_DEVICE constexpr auto get_num_of_access() const { return load_store_traits::NumAccess; } + + template + CK_TILE_DEVICE auto load(number = {}, + bool_constant = {}) const + { + constexpr auto tile_dstr = TileDstr{}; + auto dst_tensor = make_static_distributed_tensor(tile_dstr); + load(dst_tensor, number{}, bool_constant{}); + return dst_tensor; + } + + template + CK_TILE_DEVICE auto load(DistributedTensor& dst_tensor, + number = {}, + bool_constant = {}) const + { + using Traits = load_store_traits; + using vector_t = typename Traits::vector_t; + using SFC_Ys = typename Traits::SFC_Ys; + + constexpr auto tile_dstr = TileDstr{}; + + // loop over thread tensor space [y0, y1, ...] 
+ static_for<0, NumCoord, 1>{}([&](auto iCoord) { + /// TODO: use structure binding (to be captured later) if compiled in C++20 + auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0]; + auto bottom_tensor_thread_coord = pre_computed_coords_[iCoord][I1]; + + static_for<0, NumAccessPerCoord, 1>{}([&](auto iCoordAccess) { + constexpr auto iAccess = number{}; + + // data index [y0, y1, ...] + constexpr auto idx_ys_start = SFC_Ys::get_index(iAccess); + constexpr auto idx_gather = idx_ys_start[number{}]; + const auto page_offset = page_idx_[idx_gather]; + // read from bottom tensor + const vector_t vec_value = + get_bottom_tensor_view().template get_vectorized_elements( + bottom_tensor_thread_coord, + page_offset, + bool_constant{}); +#if 1 + // write into distributed tensor + static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](auto j) { + constexpr auto idx_ys = generate_tuple( + [&](auto jj) { + return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j) + : idx_ys_start[jj]; + }, + number{}); + + constexpr index_t d = + tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) / + Traits::PackedSize; + + dst_tensor.get_thread_buffer().template at() = + vec_value.template get_as()[j / Traits::PackedSize]; + }); +#else + constexpr index_t d = + tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start); + static_assert(d % Traits::ScalarPerVector == 0); + + dst_tensor.get_thread_buffer().template get_as()( + number{}) = bit_cast(vec_value); +#endif + // move thread coordinate + if constexpr(iCoordAccess != (NumAccessPerCoord - 1)) + { + constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess); + + constexpr auto forward_step_scatter = generate_tuple( + [&](auto i) { return i == YsGatherDim ? 
0 : idx_diff_ys[i]; }, + number{}); + + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), + forward_step_scatter); + + move_window_adaptor_and_bottom_tensor_thread_coordinate( + window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys); + } + }); + }); + } + + // TODO: currently async load only implemented in inline asm + template + CK_TILE_DEVICE auto async_load_raw(LdsTileWindow_&& lds_tile, + number = {}, + bool_constant = {}, + bool_constant = {}) const + { + using LdsTileWindow = remove_cvref_t; + // using LdsTensorView = typename LdsTileWindow::BottomTensorView; + using LdsDataType = typename LdsTileWindow::DataType; + // using LdsDescriptor = typename LdsTileWindow::BottomTensorDesc; + + // issues * warps * lanes + static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded + + const index_t size_per_buf = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<0>{}, number<0>{}, number<0>{})) * + sizeof(LdsDataType); + + const index_t size_per_wave = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<0>{}, number<1>{}, number<0>{})) * + sizeof(LdsDataType) - + size_per_buf; + + const index_t size_per_issue = + lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset( + make_tuple(number<1>{}, number<0>{}, number<0>{})) * + sizeof(LdsDataType) - + size_per_buf; + + const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id(); + m0_set_with_memory(m0_init_value); // This should be wave independent + + using Traits = load_store_traits; + + // using vector_type_t = typename Traits::vector_type_t; + using vector_t = typename Traits::vector_t; + using SFC_Ys = typename Traits::SFC_Ys; + + LdsDataType* smem = lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_; + + // loop over thread tensor space [y0, y1, ...] 
+ static_for<0, NumCoord, 1>{}([&](auto iCoord) { + /// TODO: use structure binding (to be captured later) if compiled in C++20 + auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0]; + auto bottom_tensor_thread_coord = pre_computed_coords_[iCoord][I1]; + + static_for<0, NumAccessPerCoord, 1>{}([&](auto iCoordAccess) { + constexpr auto iAccess = number{}; + constexpr auto pre_nop_ = [&]() { + if constexpr(pre_nop && iCoord == 0 && iCoordAccess == 0) + return bool_constant{}; + else + return bool_constant{}; + }(); + + constexpr auto idx_ys_start = SFC_Ys::get_index(iAccess); + constexpr auto idx_gather = idx_ys_start[number{}]; + const auto page_offset = page_idx_[idx_gather]; + // read from bottom tensor + get_bottom_tensor_view().template async_get_vectorized_elements_raw( + smem, bottom_tensor_thread_coord, page_offset, 0, pre_nop_); + + // move thread coordinate + if constexpr(iCoordAccess != (NumAccessPerCoord - 1)) + { + constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess); + + constexpr auto forward_step_scatter = generate_tuple( + [&](auto i) { return i == YsGatherDim ? 0 : idx_diff_ys[i]; }, + number{}); + + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), + forward_step_scatter); + + move_window_adaptor_and_bottom_tensor_thread_coordinate( + window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys); + + m0_inc_with_memory(size_per_issue); + } + }); + }); + } + + template + CK_TILE_DEVICE void store(const static_distributed_tensor& dstr_tensor, + number = {}, + bool_constant = {}) const + { + using Traits = load_store_traits; + + // using vector_type_t = typename Traits::vector_type_t; + using vector_t = typename Traits::vector_t; + using SFC_Ys = typename Traits::SFC_Ys; + + constexpr auto tile_dstr = TileDstr{}; + // printf("off %d\n", page_idx_[I0]); + // loop over thread tensor space [y0, y1, ...] 
+ static_for<0, NumCoord, 1>{}([&](auto iCoord) { + auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0]; + auto bottom_tensor_thread_coord = pre_computed_coords_[iCoord][I1]; + + static_for<0, NumAccessPerCoord, 1>{}([&](auto iCoordAccess) { + constexpr auto iAccess = number{}; + + // data index [y0, y1, ...] + constexpr auto idx_ys_start = SFC_Ys::get_index(iAccess); + constexpr auto idx_gather = idx_ys_start[number<0>{}]; + const auto page_offset = page_idx_[idx_gather]; + + // printf("idx_ys_start[0], idx_ys_start[1](%d, %d) \n", + // idx_ys_start[number<0>{}]+0, idx_ys_start[number<1>{}]+0); + + // read from distributed tensor + // vector_type_t vec; + vector_t vec_value; + + static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](auto j) { + constexpr auto idx_ys = generate_tuple( + [&](auto jj) { + return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j) + : idx_ys_start[jj]; + }, + number{}); + + constexpr index_t d = + tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) / + Traits::PackedSize; + // printf("thread_idx_m: %d j: %d\n", idx_ys[number<0>{}] + 0, 0+j); + vec_value.template get_as()(j / Traits::PackedSize) = + dstr_tensor.get_thread_buffer().template at(); + }); + + // const vector_t vec_value = vec.template get_as().template at<0>(); + + // write into bottom tensor + get_bottom_tensor_view().template set_vectorized_elements( + bottom_tensor_thread_coord, + page_offset, + vec_value, + bool_constant{}); + // printf("coord_offset:%d, scatter_offset:%d \n", + // bottom_tensor_thread_coord.get_offset(), offset); move thread coordinate + if constexpr(iCoordAccess != (NumAccessPerCoord - 1)) + { + constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess); + + constexpr auto forward_step_scatter = generate_tuple( + [&](auto i) { return i == YsGatherDim ? 
0 : idx_diff_ys[i]; }, + number{}); + + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), + forward_step_scatter); + + move_window_adaptor_and_bottom_tensor_thread_coordinate( + window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys); + } + }); + }); + } + + // move thread's bottom tensor coordinate + // [x0', x1', ... ] ==> [offset] + // also move window-origin + CK_TILE_DEVICE void move(const BottomTensorIndex& step) + { + window_origin_ += step; + BottomTensorIndex step_new = step; + step_new(HsGatherDim) = 0; + static_for<0, NumCoord, 1>{}([&](auto iCoord) { + move_tensor_coordinate(bottom_tensor_view_.get_tensor_descriptor(), + pre_computed_coords_(iCoord)(I1), + step_new); + }); + } + + CK_TILE_DEVICE void update_page_idx(const PageIdxArray& new_idx) + { + page_idx_ = new_idx; + + // static_for<0, 2, 1>{}([&](auto k0) { + // printf("update tid %d %d \n", threadIdx.x, page_idx_[k0]); + // }); + } + CK_TILE_DEVICE void set_window_origin(const BottomTensorIndex& new_window_origin) + { + window_origin_ = new_window_origin; + +#if 0 // debug + // TODO: this use more register for FA, but less register for GEMM + // need investigation + // only support warp-tile and block-tile + static_assert(NDimP == 1 or NDimP == 2, "wrong!"); + + WindowAdaptorCoord window_adaptor_thread_coord_tmp; + + if constexpr(NDimP == 1) + { + window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate( + tile_dstr_.get_ps_ys_to_xs_adaptor(), AdaptorTopIndex{get_lane_id(), 0}); + } + else if constexpr(NDimP == 2) + { + window_adaptor_thread_coord_tmp = + make_tensor_adaptor_coordinate(tile_dstr_.get_ps_ys_to_xs_adaptor(), + AdaptorTopIndex{get_warp_id(), get_lane_id(), 0}); + } +#else + // TODO: this use less register for FA, but more register for GEMM + // need investigation + const auto window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate( + tile_dstr_.get_ps_ys_to_xs_adaptor(), + 
container_concat(detail::get_partition_index(tile_dstr_), array{0})); +#endif + + BottomTensorIndex bottom_tensor_thread_origin_idx_tmp = + window_origin_ + window_adaptor_thread_coord_tmp.get_bottom_index(); + + bottom_tensor_thread_origin_idx_tmp(HsGatherDim) = 0; + const auto bottom_tensor_thread_coord_tmp = make_tensor_coordinate( + bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_origin_idx_tmp); + + // pre-compute NumCoord (WindowAdaptorCoord, BottomTensorCoord) bundles to speed up + // future load/store() calls (might allocate more registers) + using Traits = load_store_traits; + using SFC_Ys = typename Traits::SFC_Ys; + + static_for<0, NumCoord, 1>{}([&](auto iCoord) { + auto window_adaptor_thread_coord = window_adaptor_thread_coord_tmp; + auto bottom_tensor_thread_coord = bottom_tensor_thread_coord_tmp; + + constexpr auto idx_diff_ys = + SFC_Ys::get_step_between(number<0>{}, number{}); + + constexpr auto idx_diff_ps_ys = container_concat( + generate_tuple([&](auto) { return number<0>{}; }, number{}), idx_diff_ys); + + move_window_adaptor_and_bottom_tensor_thread_coordinate( + window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys); + + pre_computed_coords_(iCoord) = + make_tuple(window_adaptor_thread_coord, bottom_tensor_thread_coord); + }); + } + + CK_TILE_HOST_DEVICE void init_raw() { bottom_tensor_view_.init_raw(); } + + // this is the bottom tensor view + // [x0', x1', ...] ==> [offset] + BottomTensorView bottom_tensor_view_; + + // + WindowLengths window_lengths_; + + // origin ([x0', x1', ...]) of window on bottom tensor + BottomTensorIndex window_origin_; + + // Tile tensor distribution, which contains: + // 1. adaptor for window: [p0, p1, ..., y0, y1, ...] ==> [x0, x1, ...] + // 2. thread descriptor for thread tensor in register: [y0, y1, ...] 
==> [d] + TileDstr tile_dstr_; + + PageIdxArray page_idx_; + + // this contains: + // per-thread coordinate for window adaptor + // per-thread coordinate for bottom tensor + array, NumCoord> pre_computed_coords_; +}; + +// TODO: use strategy +template +CK_TILE_DEVICE constexpr auto +make_tile_scatter_gather(const TensorView_& tensor_view, + const WindowLengths_& window_lengths, + const multi_index& origin, + const StaticTileDistribution_& tile_distribution, + const StaticPageIndexArray_& page_idx, + number = {}, + number = {}) +{ + return tile_scatter_gather, + remove_cvref_t, + remove_cvref_t, + remove_cvref_t, + HsGatherDim, + NumCoord>{ + tensor_view, window_lengths, origin, tile_distribution, page_idx}; +} + +template +CK_TILE_DEVICE constexpr auto make_tile_scatter_gather( + const tile_window_with_static_lengths& tile_window, + const multi_index& origin, + const StaticTileDistribution& tile_distribution, + const StaticPageIndexArray& page_idx, + number = {}) +{ + return make_tile_scatter_gather(tile_window.get_bottom_tensor_view(), + tile_window.get_window_lengths(), + origin, + tile_distribution, + page_idx, + number{}); +} + +template +CK_TILE_DEVICE constexpr auto make_tile_scatter_gather( + const tile_window_with_static_lengths& tile_window, + const StaticTileDistribution& tile_distribution, + const StaticPageIndexArray& page_idx, + number = {}) +{ + return make_tile_scatter_gather(tile_window.get_bottom_tensor_view(), + tile_window.get_window_lengths(), + tile_window.get_window_origin(), + tile_distribution, + page_idx, + number{}); +} + +} // namespace ck_tile diff --git a/include/ck_tile/core/tensor/tile_window_utils.hpp b/include/ck_tile/core/tensor/tile_window_utils.hpp index 71a72329f8..f8b232a7af 100644 --- a/include/ck_tile/core/tensor/tile_window_utils.hpp +++ b/include/ck_tile/core/tensor/tile_window_utils.hpp @@ -18,6 +18,13 @@ #pragma once namespace ck_tile { +template +CK_TILE_DEVICE void move_tile_window(TileWindow_& window, + const typename 
TileWindow_::BottomTensorIndex& step) +{ + window.move(step); +} + // input a lds store tile, extract some information from it // used to set m0 value for gfx9 serious template diff --git a/include/ck_tile/ops/fmha.hpp b/include/ck_tile/ops/fmha.hpp index a28b63f813..ac6ef9cae3 100644 --- a/include/ck_tile/ops/fmha.hpp +++ b/include/ck_tile/ops/fmha.hpp @@ -9,12 +9,16 @@ #include "ck_tile/ops/fmha/block/block_position_encoding.hpp" #include "ck_tile/ops/fmha/block/block_rotary_embedding.hpp" #include "ck_tile/ops/fmha/block/page_block_navigator.hpp" +#include "ck_tile/ops/fmha/block/variants.hpp" +#include "ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp" #include "ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp" #include "ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp" #include "ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_tile_partitioner.hpp" #include "ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp" #include "ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp" #include "ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp" +#include "ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp" +#include "ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async_default_policy.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_convert_dq.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dot_do_o.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp" diff --git a/include/ck_tile/ops/fmha/block/variants.hpp b/include/ck_tile/ops/fmha/block/variants.hpp new file mode 100644 index 0000000000..90fc5656fc --- /dev/null +++ b/include/ck_tile/ops/fmha/block/variants.hpp @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include +#include + +#define CK_TILE_ATTENTION_LOGITS_SOFT_CAP_TANH 0 +#define CK_TILE_ATTENTION_LOGITS_SOFT_CAP_SOFTSIGN 1 + +#ifndef CK_TILE_ATTENTION_LOGITS_SOFT_CAP_DEFAULT +#define CK_TILE_ATTENTION_LOGITS_SOFT_CAP_DEFAULT CK_TILE_ATTENTION_LOGITS_SOFT_CAP_TANH +#endif + +namespace ck_tile { + +template +struct StandardAttentionParams +{ + __device__ __host__ StandardAttentionParams(const ImplMask& impl_mask_, float sm_scale_) + : impl_mask(impl_mask_), sm_scale(sm_scale_) + { + } + + const ImplMask& impl_mask; + float sm_scale; +}; + +template +struct LogitsSoftCapParams +{ + __device__ + LogitsSoftCapParams(const ImplMask& impl_mask_, float sm_scale_, float logits_soft_cap_) + : impl_mask(impl_mask_), sm_scale(sm_scale_), logits_soft_cap(logits_soft_cap_) + { + if(0.f < logits_soft_cap) + { + logits_soft_cap_rcp = __builtin_amdgcn_rcpf(logits_soft_cap); + } + else + { + logits_soft_cap_rcp = 0.f; + } + + // move computation here to prevent compiler from generating inefficient instruction + // sequence + if constexpr(UseExp2) + { + logits_soft_cap = log2e_v * logits_soft_cap; + logits_soft_cap_rcp = sm_scale * log2e_rcp_v * logits_soft_cap_rcp; + } + } + + __host__ + LogitsSoftCapParams(const ImplMask& impl_mask_, float sm_scale_, float logits_soft_cap_) + : impl_mask(impl_mask_), sm_scale(sm_scale_), logits_soft_cap(logits_soft_cap_) + { + if(0.f < logits_soft_cap) + { + logits_soft_cap_rcp = 1.f / logits_soft_cap; + } + else + { + logits_soft_cap_rcp = 0.f; + } + + // move computation here to prevent compiler from generating inefficient instruction + // sequence + if constexpr(UseExp2) + { + logits_soft_cap = log2e_v * logits_soft_cap; + logits_soft_cap_rcp = sm_scale * log2e_rcp_v * logits_soft_cap_rcp; + } + } + + __device__ __host__ LogitsSoftCapParams(const ImplMask& impl_mask_, + float sm_scale_, + float logits_soft_cap_, + float logits_soft_cap_rcp_) + : impl_mask(impl_mask_), + sm_scale(sm_scale_), + 
logits_soft_cap(logits_soft_cap_), + logits_soft_cap_rcp(logits_soft_cap_rcp_) + { + // move computation here to prevent compiler from generating inefficient instruction + // sequence + if constexpr(UseExp2) + { + logits_soft_cap = log2e_v * logits_soft_cap; + logits_soft_cap_rcp = sm_scale * log2e_rcp_v * logits_soft_cap_rcp; + } + } + + const ImplMask& impl_mask; + float sm_scale; + float logits_soft_cap; + float logits_soft_cap_rcp; +}; + +struct StandardAttention +{ + __device__ __host__ StandardAttention() = default; + + template + __device__ __forceinline__ T QueryTransform(const Params& params, T q) const + { + return type_convert(q) * params.sm_scale; + } + + /// NOTICE: For better performance, we simply transform thread buffer without calculating + /// qo_idx/kv_idx. + template + __device__ __forceinline__ T LogitsTransform([[maybe_unused]] const Params& params, + T logits, + [[maybe_unused]] uint32_t batch_idx, + /*uint32_t qo_idx, uint32_t kv_idx,*/ + [[maybe_unused]] uint32_t qo_head_idx, + [[maybe_unused]] uint32_t kv_head_idx) const + { + return logits; + } + + template + __device__ __forceinline__ bool LogitsMask(const Params& params, + [[maybe_unused]] uint32_t batch_idx, + uint32_t qo_idx, + uint32_t kv_idx, + [[maybe_unused]] uint32_t qo_head_idx, + [[maybe_unused]] uint32_t kv_head_idx) const + { + return !params.impl_mask.IsOutOfBound(qo_idx, kv_idx); + } +}; + +template +struct LogitsSoftCap +{ + __device__ __host__ LogitsSoftCap() = default; + + template + __device__ __forceinline__ T QueryTransform(const Params& params, T q) const + { + if constexpr(UseExp2) + { + return q; + } + else + { + return type_convert(q) * params.sm_scale; + } + } + + /// NOTICE: For better performance, we simply transform thread buffer without calculating + /// qo_idx/kv_idx. 
+ template + __device__ __forceinline__ T LogitsTransform(const Params& params, + T logits, + [[maybe_unused]] uint32_t batch_idx, + /*uint32_t qo_idx, uint32_t kv_idx,*/ + [[maybe_unused]] uint32_t qo_head_idx, + [[maybe_unused]] uint32_t kv_head_idx) const + { + if constexpr(UseExp2) + { +#if CK_TILE_ATTENTION_LOGITS_SOFT_CAP_DEFAULT == CK_TILE_ATTENTION_LOGITS_SOFT_CAP_TANH + return params.logits_soft_cap * + tanh_fast(type_convert(logits) * params.logits_soft_cap_rcp); +#elif CK_TILE_ATTENTION_LOGITS_SOFT_CAP_DEFAULT == CK_TILE_ATTENTION_LOGITS_SOFT_CAP_SOFTSIGN + return params.sm_scale * type_convert(logits) * + rcp(1.f + abs(type_convert(logits) * params.logits_soft_cap_rcp)); +#endif + } + else + { +#if CK_TILE_ATTENTION_LOGITS_SOFT_CAP_DEFAULT == CK_TILE_ATTENTION_LOGITS_SOFT_CAP_TANH + return params.logits_soft_cap * + tanhf(type_convert(logits) * params.logits_soft_cap_rcp); +#elif CK_TILE_ATTENTION_LOGITS_SOFT_CAP_DEFAULT == CK_TILE_ATTENTION_LOGITS_SOFT_CAP_SOFTSIGN + return type_convert(logits) * + rcp(1.f + abs(type_convert(logits) * params.logits_soft_cap_rcp)); +#endif + } + } + + template + __device__ __forceinline__ bool LogitsMask(const Params& params, + [[maybe_unused]] uint32_t batch_idx, + uint32_t qo_idx, + uint32_t kv_idx, + [[maybe_unused]] uint32_t qo_head_idx, + [[maybe_unused]] uint32_t kv_head_idx) const + { + return !params.impl_mask.IsOutOfBound(qo_idx, kv_idx); + } +}; + +constexpr uint32_t CUSTOM_MASK = 1U; +constexpr uint32_t SLIDING_WINDOW = 2U; +constexpr uint32_t LOGITS_SOFT_CAP = 4U; +constexpr uint32_t ALIBI = 8U; + +template +struct ComposedAttention +{ + static constexpr bool use_exp2 = UseExp2; + + static constexpr bool use_logits_soft_cap = (VARIANT_CODE & LOGITS_SOFT_CAP) != 0; + + __device__ __host__ ComposedAttention() = default; + + template + __device__ __forceinline__ T QueryTransform(const Params& params, T q) const + { + if constexpr(use_logits_soft_cap && UseExp2) + { + return q; + } + return type_convert(q) * 
params.sm_scale; + } + + /// NOTICE: For better performance, we simply transform thread buffer without calculating + /// qo_idx/kv_idx. + template + __device__ __forceinline__ T LogitsTransform(const Params& params, + T logits, + [[maybe_unused]] uint32_t batch_idx, + /*uint32_t qo_idx, uint32_t kv_idx,*/ + [[maybe_unused]] uint32_t qo_head_idx, + [[maybe_unused]] uint32_t kv_head_idx) const + { + if constexpr(use_logits_soft_cap) + { + if constexpr(UseExp2) + { +#if CK_TILE_ATTENTION_LOGITS_SOFT_CAP_DEFAULT == CK_TILE_ATTENTION_LOGITS_SOFT_CAP_TANH + return params.logits_soft_cap * + tanh_fast(type_convert(logits) * params.logits_soft_cap_rcp); +#elif CK_TILE_ATTENTION_LOGITS_SOFT_CAP_DEFAULT == CK_TILE_ATTENTION_LOGITS_SOFT_CAP_SOFTSIGN + return params.sm_scale * type_convert(logits) * + rcp(1.f + + abs(type_convert(logits) * params.logits_soft_cap_rcp)); +#endif + } + else + { +#if CK_TILE_ATTENTION_LOGITS_SOFT_CAP_DEFAULT == CK_TILE_ATTENTION_LOGITS_SOFT_CAP_TANH + return params.logits_soft_cap * + tanhf(type_convert(logits) * params.logits_soft_cap_rcp); +#elif CK_TILE_ATTENTION_LOGITS_SOFT_CAP_DEFAULT == CK_TILE_ATTENTION_LOGITS_SOFT_CAP_SOFTSIGN + return type_convert(logits) * + rcp(1.f + + abs(type_convert(logits) * params.logits_soft_cap_rcp)); +#endif + } + } + return logits; + } + + template + __device__ __forceinline__ bool LogitsMask(const Params& params, + [[maybe_unused]] uint32_t batch_idx, + uint32_t qo_idx, + uint32_t kv_idx, + [[maybe_unused]] uint32_t qo_head_idx, + [[maybe_unused]] uint32_t kv_head_idx) const + { + return !params.impl_mask.IsOutOfBound(qo_idx, kv_idx); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp new file mode 100644 index 0000000000..ba327ee511 --- /dev/null +++ b/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp @@ -0,0 +1,1134 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 
2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" +#include "ck_tile/ops/fmha/block/block_attention_bias_enum.hpp" +#include "ck_tile/ops/fmha/block/variants.hpp" + +#include +#include +#include +#include + +// S[seqlen_q, seqlen_k] = Q[seqlen_q, hdim_q] @ K[seqlen_k, hdim_q] +// S'[seqlen_q, seqlen_k] = S[seqlen_q, seqlen_k] * Scale[1] +// S''[seqlen_q, seqlen_k] = S'[seqlen_q, seqlen_k] + Bias[seqlen_q, seqlen_k] +// P[seqlen_q, seqlen_k] = Softmax(S''[seqlen_q, seqlen_k]) +// O[seqlen_q, hdim_v] = P[seqlen_q, seqlen_k] @ V^T[hdim_v, seqlen_k] + +namespace ck_tile { + +template +struct FmhaBatchPrefillWithPagedKVCacheKernel +{ + using FmhaPipeline = ck_tile::remove_cvref_t; + using EpiloguePipeline = ck_tile::remove_cvref_t; + static constexpr ck_tile::index_t kBlockSize = FmhaPipeline::kBlockSize; + static constexpr ck_tile::index_t kBlockPerCu = FmhaPipeline::kBlockPerCu; + static_assert(kBlockPerCu > 0); + static constexpr ck_tile::index_t kBlockPerCuInput = FmhaPipeline::Problem::kBlockPerCu; + + using QDataType = ck_tile::remove_cvref_t; + using KDataType = ck_tile::remove_cvref_t; + using VDataType = ck_tile::remove_cvref_t; + using BiasDataType = ck_tile::remove_cvref_t; + using RandValOutputDataType = + ck_tile::remove_cvref_t; + using LSEDataType = ck_tile::remove_cvref_t; + using ODataType = ck_tile::remove_cvref_t; + using SaccDataType = ck_tile::remove_cvref_t; + + using VLayout = ck_tile::remove_cvref_t; + + static constexpr bool kIsGroupMode = FmhaPipeline::kIsGroupMode; + static constexpr bool kPadSeqLenQ = FmhaPipeline::kPadSeqLenQ; + static constexpr bool kPadSeqLenK = FmhaPipeline::kPadSeqLenK; + static constexpr bool kPadHeadDimQ = FmhaPipeline::kPadHeadDimQ; + static constexpr bool kPadHeadDimV = FmhaPipeline::kPadHeadDimV; + static constexpr bool kHasLogitsSoftCap = FmhaPipeline::kHasLogitsSoftCap; + static constexpr auto BiasEnum = 
FmhaPipeline::BiasEnum; + static constexpr bool kStoreLSE = FmhaPipeline::kStoreLSE; + static constexpr bool kHasDropout = FmhaPipeline::kHasDropout; + static constexpr bool kDoFp8StaticQuant = FmhaPipeline::Problem::kDoFp8StaticQuant; + using AttentionVariant = ck_tile::remove_cvref_t; + using FmhaMask = ck_tile::remove_cvref_t; + static constexpr bool kHasMask = FmhaMask::IsMasking; + + static constexpr bool kUseAsyncCopy = FmhaPipeline::Policy::AsyncCopy; + + // clang-format off + template struct t2s; + template <> struct t2s { static constexpr const char * name = "fp32"; }; + template <> struct t2s { static constexpr const char * name = "fp16"; }; + template <> struct t2s { static constexpr const char * name = "bf16"; }; + template <> struct t2s { static constexpr const char * name = "fp8"; }; + template <> struct t2s { static constexpr const char * name = "bf8"; }; + // clang-format on + + CK_TILE_HOST static std::string GetName() + { + // sync with generate.py + // clang-format off + using bfs = typename FmhaPipeline::BlockFmhaShape; + using g0br = typename bfs::Gemm0BlockWarps; + using g1br = typename bfs::Gemm1BlockWarps; + using g0wt = typename bfs::Gemm0WarpTile; + using g1wt = typename bfs::Gemm1WarpTile; + #define _SS_ std::string + #define _TS_ std::to_string + auto pn = [&] () { + std::string n; + if (kPadSeqLenQ) n += "s"; + if (kPadSeqLenK) n += "sk"; + if (kPadHeadDimQ) n += "d"; + if (kPadHeadDimV) n += "dv"; + return n.empty() ? n : std::string("p") + n; }(); + return + _SS_("fmha_batch_prefill_d") + _TS_(bfs::kQKHeaddim) + "_" + _SS_(t2s::name) + + "_" + (kIsGroupMode ? 
"group" : "batch") + "_" + "b" + _TS_(bfs::kM0) + "x" + _TS_(bfs::kN0) + "x" + _TS_(bfs::kK0) + "x" + + _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kQKHeaddim) + "_" + + "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" + + "r" + _TS_(g1br::at(ck_tile::number<0>{})) + "x" + _TS_(g1br::at(ck_tile::number<1>{})) + "x" + _TS_(g1br::at(ck_tile::number<2>{})) + "_" + + "w" + _TS_(g0wt::at(ck_tile::number<0>{})) + "x" + _TS_(g0wt::at(ck_tile::number<1>{})) + "x" + _TS_(g0wt::at(ck_tile::number<2>{})) + "_" + + "w" + _TS_(g1wt::at(ck_tile::number<0>{})) + "x" + _TS_(g1wt::at(ck_tile::number<1>{})) + "x" + _TS_(g1wt::at(ck_tile::number<2>{})) + "_" + + (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" + + "v" + (std::is_same_v ? "r" : "c") + (pn.empty() ? "_npad" : "_" + pn) + + (kHasLogitsSoftCap ? "_logits" : "_nlogits" ) + (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr::name)) + + (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kStoreLSE ? "_lse" : "_nlse" ) + (kHasDropout ? "_dropout" : "_ndropout" ) + (kDoFp8StaticQuant ? "_squant" : "_nsquant" ); + #undef _SS_ + #undef _TS_ + // clang-format on + } + + template // to avoid duplicated base class prblem, introduce an template + // arg + struct FmhaFwdEmptyKargs + { + }; + + // kargs use aggregate initializer, so no constructor will provided + // use inheritance to minimize karg size + // user need to use MakeKargs() function to create kargs. + struct FmhaFwdCommonKargs + { + const void* q_ptr; + const void* k_ptr; + const void* v_ptr; + void* o_ptr; + + ck_tile::index_t seqlen_q; + ck_tile::index_t seqlen_k; + ck_tile::index_t hdim_q; + ck_tile::index_t hdim_v; + + ck_tile::index_t num_head_q; + // for MQA/GQA, nhead could be different. 
This parameter is nhead_q / nhead_k + // if this param is larger than 1, indicate MQA/GQA case + ck_tile::index_t nhead_ratio_qk; + + int32_t num_total_pages; + const int32_t* kv_indptr; + const int32_t* kv_page_indices; +#if 0 // we assume page_block_size=1 for now + const int32_t* kv_last_page_lens; + ck_tile::index_t page_block_size; +#else + static constexpr ck_tile::index_t page_block_size = 1; +#endif + + float scale_s; + + ck_tile::index_t stride_q; + ck_tile::index_t stride_k; + ck_tile::index_t stride_v; + ck_tile::index_t stride_o; + + ck_tile::index_t nhead_stride_q; + ck_tile::index_t nhead_stride_k; + ck_tile::index_t nhead_stride_v; + ck_tile::index_t nhead_stride_o; + }; + + struct FmhaFwdLogitsSoftCapKargs + { + FmhaFwdLogitsSoftCapKargs() = default; + + void init_logits_soft_cap(float logits_soft_cap_) + { + if(0 < logits_soft_cap_) + { + logits_soft_cap = logits_soft_cap_; + logits_soft_cap_rcp = 1.f / logits_soft_cap; + } + else + { + logits_soft_cap = 0.f; + logits_soft_cap_rcp = 0.f; + } + } + + float logits_soft_cap; + float logits_soft_cap_rcp; + }; + + struct FmhaFwdCommonBiasKargs + { + const void* bias_ptr = nullptr; + ck_tile::index_t stride_bias = 0; + ck_tile::index_t nhead_stride_bias = 0; + }; + + struct FmhaFwdBatchModeBiasKargs : FmhaFwdCommonBiasKargs + { + ck_tile::index_t batch_stride_bias = 0; + }; + + struct FmhaFwdAlibiKargs + { + // alibi is batch*nhead*1, no matter in batch/group mode, they are the same + const void* alibi_slope_ptr; + ck_tile::index_t alibi_slope_stride; // stride in batch, or 0 for all batch share same slope + }; + + struct FmhaFwdMaskKargs + { + // ck_tile::index_t window_size_left, window_size_right; + ck_tile::index_t window_size_left, window_size_right; + ck_tile::GenericAttentionMaskEnum mask_type; + }; + + struct FmhaFwdFp8StaticQuantKargs + { + float scale_p; + float scale_o; + }; + + struct FmhaFwdCommonLSEKargs + { + void* lse_ptr = nullptr; + ck_tile::index_t nhead_stride_lse = 0; + 
ck_tile::index_t batch_stride_lse = 0; + }; + + struct FmhaFwdDropoutSeedOffset + { + template + union ValueOrPointer + { + T val; + const T* ptr; + }; + + ValueOrPointer drop_seed; + ValueOrPointer drop_offset; + bool is_drop_seed_offset_from_host; + }; + + struct FmhaFwdCommonDropoutKargs : FmhaFwdDropoutSeedOffset + { + void init_dropout(float p_drop, uint64_t seed, uint64_t offset) + { + float p_undrop = 1.0 - p_drop; + p_undrop_in_uint8_t = + uint8_t(std::floor(p_undrop * std::numeric_limits::max())); + rp_undrop = 1.0 / p_undrop; + + this->drop_seed.val = seed; + this->drop_offset.val = offset; + this->is_drop_seed_offset_from_host = true; + } + + void init_dropout(float p_drop, const uint64_t* seed_ptr, const uint64_t* offset_ptr) + { + float p_undrop = 1.0 - p_drop; + p_undrop_in_uint8_t = + uint8_t(std::floor(p_undrop * std::numeric_limits::max())); + rp_undrop = 1.0 / p_undrop; + + this->drop_seed.ptr = seed_ptr; + this->drop_offset.ptr = offset_ptr; + this->is_drop_seed_offset_from_host = false; + } + + float rp_undrop = 1; + uint8_t p_undrop_in_uint8_t = std::numeric_limits::max(); + bool is_store_randval = false; + void* rand_val_ptr = nullptr; + + ck_tile::index_t stride_randval = 0; + ck_tile::index_t nhead_stride_randval = 0; + }; + + struct FmhaFwdBatchModeDropoutKargs : FmhaFwdCommonDropoutKargs + { + ck_tile::index_t batch_stride_randval = 0; + }; + + struct FmhaFwdBatchModeKargs + : FmhaFwdCommonKargs, + std::conditional_t>>, + std::conditional_t>, + std::conditional_t>, + std::conditional_t>, + std::conditional_t>, + std::conditional_t> + { + ck_tile::index_t batch_stride_q; + ck_tile::index_t batch_stride_k; + ck_tile::index_t batch_stride_v; + ck_tile::index_t batch_stride_o; + }; + + struct FmhaFwdGroupModeKargs + : FmhaFwdCommonKargs, + std::conditional_t>>, + std::conditional_t>, + std::conditional_t>, + std::conditional_t>, + std::conditional_t>, + std::conditional_t> + { + const int32_t* seqstart_q_ptr; + ck_tile::index_t batch_stride_k; 
+ ck_tile::index_t batch_stride_v; + }; + + using Kargs = std::conditional_t; + + struct BlockIndices + { + ck_tile::index_t batch_idx; + ck_tile::index_t qo_head_idx; + ck_tile::index_t kv_head_idx; + }; + + template + CK_TILE_HOST static constexpr std::enable_if_t + MakeKargsImpl(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + void* rand_val_ptr, + void* lse_ptr, + void* o_ptr, + ck_tile::index_t seqlen_q, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + int32_t num_total_pages, + const void* kv_indptr, + const void* kv_page_indices, +#if 0 // we assume page_block_size=1 for now + const void* kv_last_page_lens, + ck_tile::index_t page_block_size, +#endif + float scale_s, + float scale_p, + float scale_o, + float logits_soft_cap, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_o, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_lse, + ck_tile::index_t nhead_stride_o, + ck_tile::index_t batch_stride_q, + ck_tile::index_t batch_stride_k, + ck_tile::index_t batch_stride_v, + ck_tile::index_t batch_stride_bias, + ck_tile::index_t batch_stride_randval, + ck_tile::index_t batch_stride_lse, + ck_tile::index_t batch_stride_o, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + bool s_randval, + std::variant, std::pair> + drop_seed_offset) + { + Kargs kargs{{q_ptr, + k_ptr, + v_ptr, + o_ptr, + seqlen_q, + -1, + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + num_total_pages, + reinterpret_cast(kv_indptr), + reinterpret_cast(kv_page_indices), +#if 0 // we assume page_block_size=1 for now + 
reinterpret_cast(kv_last_page_lens), + page_block_size, +#endif +#if CK_TILE_FMHA_FWD_FAST_EXP2 + static_cast(scale_s * ck_tile::log2e_v<>), +#else + scale_s, +#endif + stride_q, + stride_k, + stride_v, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_o}, // args for common karg + {}, // placeholder for bias + {}, // placeholder for mask + {}, // placeholder for lse + {}, // placeholder for fp8_static_quant args + {}, // placeholder for dropout + {}, // placeholder for logits_soft_cap + batch_stride_q, + batch_stride_k, + batch_stride_v, + batch_stride_o}; + + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) + { + kargs.bias_ptr = bias_ptr; + kargs.stride_bias = stride_bias; + kargs.nhead_stride_bias = nhead_stride_bias; + kargs.batch_stride_bias = batch_stride_bias; + } + else if constexpr(BiasEnum == BlockAttentionBiasEnum::ALIBI) + { + kargs.alibi_slope_ptr = bias_ptr; + kargs.alibi_slope_stride = stride_bias; + } + if constexpr(kHasMask) + { + kargs.window_size_left = window_size_left; + kargs.window_size_right = window_size_right; + kargs.mask_type = static_cast(mask_type); + } + if constexpr(kStoreLSE) + { + kargs.lse_ptr = lse_ptr; + kargs.nhead_stride_lse = nhead_stride_lse; + kargs.batch_stride_lse = batch_stride_lse; + } + if constexpr(kDoFp8StaticQuant) + { + kargs.scale_p = scale_p; + kargs.scale_o = scale_o; + } + if constexpr(kHasDropout) + { + if(drop_seed_offset.index() == 0) // seed & offset come from host + { + const auto& [seed, offset] = std::get<0>(drop_seed_offset); + kargs.init_dropout(p_drop, seed, offset); + } + else // seed & offset come from device + { + const auto& [seed_ptr, offset_ptr] = std::get<1>(drop_seed_offset); + kargs.init_dropout(p_drop, + reinterpret_cast(seed_ptr), + reinterpret_cast(offset_ptr)); + } + + kargs.rand_val_ptr = rand_val_ptr; + kargs.stride_randval = stride_randval; + kargs.nhead_stride_randval = nhead_stride_randval; + kargs.batch_stride_randval = 
batch_stride_randval; + kargs.is_store_randval = s_randval; + } + if constexpr(kHasLogitsSoftCap) + { + kargs.init_logits_soft_cap(logits_soft_cap); + } + + return kargs; + } + + template + CK_TILE_HOST static constexpr std::enable_if_t + MakeKargsImpl(const void* q_ptr, + const void* k_ptr, + const void* v_ptr, + const void* bias_ptr, + void* rand_val_ptr, + void* lse_ptr, + void* o_ptr, + const void* seqstart_q_ptr, + ck_tile::index_t hdim_q, + ck_tile::index_t hdim_v, + ck_tile::index_t num_head_q, + ck_tile::index_t nhead_ratio_qk, + int32_t num_total_pages, + const void* kv_indptr, + const void* kv_page_indices, +#if 0 // we assume page_block_size=1 for now + const void* kv_last_page_lens, + ck_tile::index_t page_block_size, +#endif + float scale_s, + float scale_p, + float scale_o, + float logits_soft_cap, + ck_tile::index_t stride_q, + ck_tile::index_t stride_k, + ck_tile::index_t stride_v, + ck_tile::index_t stride_bias, + ck_tile::index_t stride_randval, + ck_tile::index_t stride_o, + ck_tile::index_t nhead_stride_q, + ck_tile::index_t nhead_stride_k, + ck_tile::index_t nhead_stride_v, + ck_tile::index_t nhead_stride_bias, + ck_tile::index_t nhead_stride_randval, + ck_tile::index_t nhead_stride_lse, + ck_tile::index_t nhead_stride_o, + ck_tile::index_t batch_stride_k, + ck_tile::index_t batch_stride_v, + ck_tile::index_t window_size_left, + ck_tile::index_t window_size_right, + ck_tile::index_t mask_type, + float p_drop, + bool s_randval, + std::variant, std::pair> + drop_seed_offset) + { + Kargs kargs{{q_ptr, + k_ptr, + v_ptr, + o_ptr, + -1, // seqlen will be updated by another pointer + -1, // + hdim_q, + hdim_v, + num_head_q, + nhead_ratio_qk, + num_total_pages, + reinterpret_cast(kv_indptr), + reinterpret_cast(kv_page_indices), +#if 0 // we assume page_block_size=1 for now + reinterpret_cast(kv_last_page_lens), + page_block_size, +#endif +#if CK_TILE_FMHA_FWD_FAST_EXP2 + static_cast(scale_s * ck_tile::log2e_v<>), +#else + scale_s, +#endif + stride_q, + 
stride_k, + stride_v, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + nhead_stride_o}, // args for common karg + {}, // placeholder for bias + {}, // placeholder for mask + {}, // placeholder for lse + {}, // placeholder for fp8_static_quant args + {}, // placeholder for dropout + {}, // placeholder for logits_soft_cap + reinterpret_cast(seqstart_q_ptr), + batch_stride_k, + batch_stride_v}; + + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) + { + kargs.bias_ptr = bias_ptr; + kargs.stride_bias = stride_bias; + kargs.nhead_stride_bias = nhead_stride_bias; + } + else if constexpr(BiasEnum == BlockAttentionBiasEnum::ALIBI) + { + kargs.alibi_slope_ptr = bias_ptr; + kargs.alibi_slope_stride = stride_bias; + } + if constexpr(kHasMask) + { + kargs.window_size_left = window_size_left; + kargs.window_size_right = window_size_right; + kargs.mask_type = static_cast(mask_type); + } + if constexpr(kStoreLSE) + { + kargs.lse_ptr = lse_ptr; + kargs.nhead_stride_lse = nhead_stride_lse; + } + if constexpr(kDoFp8StaticQuant) + { + kargs.scale_p = scale_p; + kargs.scale_o = scale_o; + } + if constexpr(kHasDropout) + { + if(drop_seed_offset.index() == 0) // seed & offset come from host + { + const auto& [seed, offset] = std::get<0>(drop_seed_offset); + kargs.init_dropout(p_drop, seed, offset); + } + else // seed & offset come from device + { + const auto& [seed_ptr, offset_ptr] = std::get<1>(drop_seed_offset); + kargs.init_dropout(p_drop, + reinterpret_cast(seed_ptr), + reinterpret_cast(offset_ptr)); + } + + kargs.rand_val_ptr = rand_val_ptr; + kargs.stride_randval = stride_randval; + kargs.nhead_stride_randval = nhead_stride_randval; + kargs.is_store_randval = s_randval; + } + if constexpr(kHasLogitsSoftCap) + { + kargs.init_logits_soft_cap(logits_soft_cap); + } + + return kargs; + } + + CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size_, + ck_tile::index_t nhead_, + ck_tile::index_t seqlen_q_, + ck_tile::index_t hdim_v_) + 
{ + if constexpr(kIsGroupMode) + { + // TODO: this may need tuning + return dim3(nhead_, + batch_size_, + ck_tile::integer_divide_ceil(seqlen_q_, FmhaPipeline::kM0) * + ck_tile::integer_divide_ceil(hdim_v_, FmhaPipeline::kN1)); + } + else + { + // TODO: this may need tuning + return dim3(ck_tile::integer_divide_ceil(seqlen_q_, FmhaPipeline::kM0) * + ck_tile::integer_divide_ceil(hdim_v_, FmhaPipeline::kN1), + nhead_, + batch_size_); + } + } + + CK_TILE_DEVICE static constexpr auto GetTileIndex(const Kargs& kargs) + { + if constexpr(kIsGroupMode) + { + // const index_t num_tile_m0 = seqlen_q / kM0; + const index_t num_tile_n1 = + ck_tile::integer_divide_ceil(kargs.hdim_v, FmhaPipeline::kN1); + + const index_t i_block = blockIdx.z; + const index_t i_nhead = blockIdx.x; + const index_t i_batch = blockIdx.y; + + const auto f = [](index_t dividend, index_t divisor) { + index_t quotient = dividend / divisor; + index_t modulus = dividend - quotient * divisor; + return ck_tile::make_tuple(quotient, modulus); + }; + + const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1); + + return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch); + } + else + { + // const index_t num_tile_m0 = seqlen_q / kM0; + const index_t num_tile_n1 = + ck_tile::integer_divide_ceil(kargs.hdim_v, FmhaPipeline::kN1); + + const index_t i_block = blockIdx.x; + const index_t i_nhead = blockIdx.y; + const index_t i_batch = blockIdx.z; + + const auto f = [](index_t dividend, index_t divisor) { + index_t quotient = dividend / divisor; + index_t modulus = dividend - quotient * divisor; + return ck_tile::make_tuple(quotient, modulus); + }; + + const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1); + + return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch); + } + } + + CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); } + + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + { + return ck_tile::max(FmhaPipeline::GetSmemSize(), 
EpiloguePipeline::GetSmemSize()); + } + + CK_TILE_DEVICE void operator()(Kargs kargs) const + { + // allocate LDS + __shared__ char smem_ptr[GetSmemSize()]; + + // divide problem + const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); + + const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); + + long_index_t batch_offset_q = 0; + long_index_t batch_offset_bias = 0; + long_index_t batch_offset_randval = 0; + long_index_t batch_offset_lse = 0; + long_index_t batch_offset_o = 0; + + const int32_t num_page_blocks = kargs.kv_indptr[i_batch + 1] - kargs.kv_indptr[i_batch]; +#if 0 // we assume page_block_size=1 for now + const int32_t last_page_len = kargs.kv_last_page_lens[i_batch]; +#endif + if constexpr(kIsGroupMode) + { + // get starting offset for each batch + const long_index_t query_start = kargs.seqstart_q_ptr[i_batch]; + + batch_offset_q = query_start * kargs.stride_q; + + kargs.kv_page_indices += kargs.kv_indptr[i_batch]; + + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) + { + batch_offset_bias = query_start * kargs.stride_bias; + } + if constexpr(kStoreLSE) + { + batch_offset_lse = query_start; + } + if constexpr(kHasDropout) + { + batch_offset_randval = query_start * kargs.stride_randval; + } + batch_offset_o = query_start * kargs.stride_o; + + // get real # queries & # keys under group mode + kargs.seqlen_q = kargs.seqstart_q_ptr[i_batch + 1] - query_start; + + // # of required blocks is different in each groups, terminate unnecessary blocks + // earlier + if(kargs.seqlen_q <= i_m0) + { + return; + } + +#if 0 // we assume page_block_size=1 for now + kargs.seqlen_k = (num_page_blocks - 1) * kargs.page_block_size + last_page_len; +#else + kargs.seqlen_k = num_page_blocks; +#endif + } + else + { + batch_offset_q = static_cast(i_batch) * kargs.batch_stride_q; + + kargs.kv_page_indices += 
kargs.kv_indptr[i_batch]; + + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) + { + batch_offset_bias = static_cast(i_batch) * kargs.batch_stride_bias; + } + if constexpr(kStoreLSE) + { + batch_offset_lse = static_cast(i_batch) * kargs.batch_stride_lse; + } + if constexpr(kHasDropout) + { + batch_offset_randval = + static_cast(i_batch) * kargs.batch_stride_randval; + } + batch_offset_o = static_cast(i_batch) * kargs.batch_stride_o; + +#if 0 // we assume page_block_size=1 for now + kargs.seqlen_k = (num_page_blocks - 1) * kargs.page_block_size + last_page_len; +#else + kargs.seqlen_k = num_page_blocks; +#endif + } + + // for simplicity, batch stride we just modify the pointer + const QDataType* q_ptr = reinterpret_cast(kargs.q_ptr) + + static_cast(i_nhead) * kargs.nhead_stride_q + + batch_offset_q; + const KDataType* k_ptr = + reinterpret_cast(kargs.k_ptr) + + static_cast(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_k; + const VDataType* v_ptr = + reinterpret_cast(kargs.v_ptr) + + static_cast(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_v; + ODataType* o_ptr = reinterpret_cast(kargs.o_ptr) + + static_cast(i_nhead) * kargs.nhead_stride_o + + batch_offset_o; + + // Q/K/V DRAM and DRAM window + const auto q_dram = [&]() { + const auto q_dram_naive = make_naive_tensor_view( + q_ptr, + make_tuple(kargs.seqlen_q, kargs.hdim_q), + make_tuple(kargs.stride_q, 1), + number{}, + number<1>{}); + if constexpr(FmhaPipeline::kQLoadOnce) + { + return pad_tensor_view( + q_dram_naive, + make_tuple(number{}, number{}), + sequence{}); + } + else + { + return pad_tensor_view( + q_dram_naive, + make_tuple(number{}, number{}), + sequence{}); + } + }(); + const auto k_dram = [&]() { + const auto k_dram_naive = make_naive_tensor_view( + k_ptr, + make_tuple(kargs.num_total_pages * kargs.page_block_size, kargs.hdim_q), + make_tuple(kargs.stride_k, 1), + number{}, + number<1>{}); + + constexpr bool kPadSeqLenK_ = kUseAsyncCopy ? 
kPadSeqLenK : true; + return pad_tensor_view( + k_dram_naive, + make_tuple(number{}, number{}), + sequence{}); + }(); + const auto v_dram = [&]() { + if constexpr(std::is_same_v) + { + const auto v_dram_naive = make_naive_tensor_view( + v_ptr, + make_tuple(kargs.num_total_pages * kargs.page_block_size, kargs.hdim_v), + make_tuple(kargs.stride_v, 1), + number{}, + number<1>{}); + + const auto v_dram_transposed = transform_tensor_view( + v_dram_naive, + make_tuple( + make_pass_through_transform(kargs.hdim_v), + make_pass_through_transform(kargs.num_total_pages * kargs.page_block_size)), + make_tuple(sequence<1>{}, sequence<0>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + constexpr bool kPadSeqLenK_ = kUseAsyncCopy ? kPadSeqLenK : true; + return pad_tensor_view( + v_dram_transposed, + make_tuple(number{}, number{}), + sequence{}); + } + else + { + const auto v_dram_naive = make_naive_tensor_view( + v_ptr, + make_tuple(kargs.hdim_v, kargs.num_total_pages * kargs.page_block_size), + make_tuple(kargs.stride_v, 1), + number{}, + number<1>{}); + + constexpr bool kPadHeadDimV_ = kUseAsyncCopy ? kPadHeadDimV : false; + return pad_tensor_view( + v_dram_naive, + make_tuple(number{}, number{}), + sequence{}); + } + }(); + + auto q_dram_window = make_tile_window( + q_dram, + [&]() { + if constexpr(FmhaPipeline::kQLoadOnce) + return make_tuple(number{}, + number{}); + else + return make_tuple(number{}, number{}); + }(), + {i_m0, 0}); + + auto k_dram_window = make_tile_window( + k_dram, make_tuple(number{}, number{}), {0, 0}); + + auto v_dram_window = + make_tile_window(v_dram, + make_tuple(number{}, number{}), + {i_n1, 0}); + /// FIXME: Before C++20, capturing structured binding variables are not supported. 
Remove + /// following copy capture of the 'i_nhead' if in C++20 + const auto bias_dram_window = [&, i_nhead_ = i_nhead]() { + constexpr auto bias_dram_window_lengths = + make_tuple(number{}, number{}); + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) + { + const BiasDataType* bias_ptr = + reinterpret_cast(kargs.bias_ptr) + + static_cast(i_nhead_) * kargs.nhead_stride_bias + + batch_offset_bias; + + const auto bias_dram = [&]() { + const auto bias_dram_naive = make_naive_tensor_view( + bias_ptr, + make_tuple(kargs.seqlen_q, kargs.seqlen_k), + make_tuple(kargs.stride_bias, 1), + number{}, + number<1>{}); + + return pad_tensor_view(bias_dram_naive, + bias_dram_window_lengths, + sequence{}); + }(); + + return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0}); + } + else + { + return make_null_tile_window(bias_dram_window_lengths); + } + }(); + + // lse + auto lse_dram_window = [&, i_nhead_ = i_nhead]() { + constexpr auto lse_dram_window_lengths = make_tuple(number{}); + if constexpr(kStoreLSE) + { + LSEDataType* lse_ptr = + reinterpret_cast(kargs.lse_ptr) + + static_cast(i_nhead_) * kargs.nhead_stride_lse + batch_offset_lse; + + const auto lse_dram = [&]() { + const auto lse_dram_naive = make_naive_tensor_view( + lse_ptr, + make_tuple(kargs.seqlen_q), + make_tuple(1), + number<1>{}, + number<1>{}); + + return pad_tensor_view( + lse_dram_naive, lse_dram_window_lengths, sequence{}); + }(); + + return make_tile_window(lse_dram, lse_dram_window_lengths, {i_m0}); + } + else + { + return make_null_tile_window(lse_dram_window_lengths); + } + }(); + + auto dropout = [&, i_nhead_ = i_nhead, i_batch_ = i_batch]() { + if constexpr(kHasDropout) + { + return BlockDropout{i_batch_, + i_nhead_, + kargs.num_head_q, + kargs.is_drop_seed_offset_from_host ? kargs.drop_seed.val + : *kargs.drop_seed.ptr, + kargs.is_drop_seed_offset_from_host ? 
kargs.drop_offset.val + : *kargs.drop_offset.ptr, + kargs.rp_undrop, + kargs.p_undrop_in_uint8_t, + kargs.is_store_randval}; + } + else + { + return NullBlockDropout{}; + }; + }(); + + auto randval_dram_window = [&, i_nhead_ = i_nhead]() { + constexpr auto randval_dram_window_lengths = + make_tuple(number{}, number{}); + if constexpr(kHasDropout) + { + RandValOutputDataType* rand_val_ptr = + reinterpret_cast(kargs.rand_val_ptr) + + static_cast(i_nhead_) * kargs.nhead_stride_randval + + batch_offset_randval; + + const auto randval_dram = [&]() { + const auto randval_dram_naive = + make_naive_tensor_view( + rand_val_ptr, + make_tuple(kargs.seqlen_q, kargs.seqlen_k), + make_tuple(kargs.stride_randval, 1), + number<1>{}, + number<1>{}); + + return pad_tensor_view(randval_dram_naive, + randval_dram_window_lengths, + sequence{}); + }(); + + return make_tile_window(randval_dram, randval_dram_window_lengths, {i_m0, 0}); + } + else + { + return make_null_tile_window(randval_dram_window_lengths); + } + }(); + + FmhaMask mask = [&]() { + if constexpr(kHasMask) + return ck_tile::make_generic_attention_mask_from_lr_window( + kargs.window_size_left, + kargs.window_size_right, + kargs.seqlen_q, + kargs.seqlen_k, + kargs.mask_type == GenericAttentionMaskEnum::MASK_FROM_TOP_LEFT); + else + return FmhaMask{kargs.seqlen_q, kargs.seqlen_k}; + }(); + + // WA i_batch capture structure binding before c++20 + auto position_encoding = [&, i_batch_ = i_batch, i_nhead_ = i_nhead]() { + if constexpr(BiasEnum == BlockAttentionBiasEnum::ALIBI) + { + // data loading, shared by entire wg + // TODO: how to use s_read? 
+ SaccDataType slope = + *(reinterpret_cast(kargs.alibi_slope_ptr) + + i_batch_ * kargs.alibi_slope_stride + i_nhead_); +#if CK_TILE_FMHA_FWD_FAST_EXP2 + slope *= ck_tile::log2e_v<>; +#endif + if constexpr(kHasMask) + { + return make_alibi_from_lr_mask(slope, + kargs.window_size_left, + kargs.window_size_right, + kargs.seqlen_q, + kargs.seqlen_k, + kargs.mask_type); + } + else + { + return Alibi{ + slope, kargs.seqlen_q, kargs.seqlen_k, AlibiMode::FROM_BOTTOM_RIGHT}; + } + } + else + { + return EmptyPositionEncoding{}; + } + }(); + + AttentionVariant variant; + const auto variant_params = [&] { + if constexpr(kHasLogitsSoftCap) + { + return ck_tile::LogitsSoftCapParams{ + mask, kargs.scale_s, kargs.logits_soft_cap, kargs.logits_soft_cap_rcp}; + } + else + { + return ck_tile::StandardAttentionParams{mask, kargs.scale_s}; + } + }(); + + BlockIndices block_indices{i_batch, i_nhead, i_nhead / kargs.nhead_ratio_qk}; + + auto o_acc_tile = [&]() { + if constexpr(kDoFp8StaticQuant) + { + return FmhaPipeline{}( + q_dram_window, + identity{}, // q_element_func + k_dram_window, + identity{}, // k_element_func + v_dram_window, + identity{}, // v_element_func + bias_dram_window, + identity{}, // bias_element_func + randval_dram_window, + lse_dram_window, + identity{}, // lse_element_func + identity{}, // s_acc_element_func + scales{kargs.scale_p}, // p_compute_element_func + composes(saturates{}, scales{kargs.scale_o}), // o_acc_element_func + mask, + position_encoding, + kargs.scale_s, + variant, + variant_params, + block_indices, + smem_ptr, + kargs.kv_page_indices, + kargs.stride_k, + kargs.stride_v, + dropout); + } + else + { + return FmhaPipeline{}(q_dram_window, + k_dram_window, + v_dram_window, + bias_dram_window, + randval_dram_window, + lse_dram_window, + mask, + position_encoding, + kargs.scale_s, + variant, + variant_params, + block_indices, + smem_ptr, + kargs.kv_page_indices, + kargs.stride_k, + kargs.stride_v, + dropout); + } + }(); + + // O DRAM and O DRAM window 
+ auto o_dram = [&]() { + const auto o_dram_naive = make_naive_tensor_view( + o_ptr, + make_tuple(kargs.seqlen_q, kargs.hdim_v), + make_tuple(kargs.stride_o, 1), + number{}, + number<1>{}); + + return pad_tensor_view( + o_dram_naive, + make_tuple(number{}, number{}), + sequence{}); + }(); + + auto o_dram_window = + make_tile_window(o_dram, + make_tuple(number{}, number{}), + {i_m0, i_n1}); + + EpiloguePipeline{}(o_dram_window, o_acc_tile); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp index 1202524950..a4b3765455 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp @@ -6,6 +6,7 @@ #include "ck_tile/core.hpp" #include "ck_tile/ops/common.hpp" #include "ck_tile/ops/fmha/block/block_attention_bias_enum.hpp" +#include "ck_tile/ops/fmha/block/variants.hpp" #include #include @@ -47,11 +48,13 @@ struct FmhaFwdKernel static constexpr bool kPadSeqLenK = FmhaPipeline::kPadSeqLenK; static constexpr bool kPadHeadDimQ = FmhaPipeline::kPadHeadDimQ; static constexpr bool kPadHeadDimV = FmhaPipeline::kPadHeadDimV; + static constexpr bool kHasLogitsSoftCap = FmhaPipeline::kHasLogitsSoftCap; static constexpr auto BiasEnum = FmhaPipeline::BiasEnum; static constexpr bool kStoreLSE = FmhaPipeline::kStoreLSE; static constexpr bool kHasDropout = FmhaPipeline::kHasDropout; static constexpr bool kDoFp8StaticQuant = FmhaPipeline::Problem::kDoFp8StaticQuant; - using FmhaMask = ck_tile::remove_cvref_t; + using AttentionVariant = ck_tile::remove_cvref_t; + using FmhaMask = ck_tile::remove_cvref_t; static constexpr bool kHasMask = FmhaMask::IsMasking; static constexpr bool kUseAsyncCopy = FmhaPipeline::Policy::AsyncCopy; @@ -94,7 +97,7 @@ struct FmhaFwdKernel "w" + _TS_(g1wt::at(ck_tile::number<0>{})) + "x" + _TS_(g1wt::at(ck_tile::number<1>{})) + "x" + _TS_(g1wt::at(ck_tile::number<2>{})) + "_" + (kBlockPerCuInput == -1 
? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" + "v" + (std::is_same_v ? "r" : "c") + (pn.empty() ? "_npad" : "_" + pn) + - (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr::name)) + + (kHasLogitsSoftCap ? "_logits" : "_nlogits" ) + (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr::name)) + (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kStoreLSE ? "_lse" : "_nlse" ) + (kHasDropout ? "_dropout" : "_ndropout" ) + (kDoFp8StaticQuant ? "_squant" : "_nsquant" ); #undef _SS_ #undef _TS_ @@ -139,6 +142,28 @@ struct FmhaFwdKernel ck_tile::index_t nhead_stride_o; }; + struct FmhaFwdLogitsSoftCapKargs + { + FmhaFwdLogitsSoftCapKargs() = default; + + void init_logits_soft_cap(float logits_soft_cap_) + { + if(0 < logits_soft_cap_) + { + logits_soft_cap = logits_soft_cap_; + logits_soft_cap_rcp = 1.f / logits_soft_cap; + } + else + { + logits_soft_cap = 0.f; + logits_soft_cap_rcp = 0.f; + } + } + + float logits_soft_cap; + float logits_soft_cap_rcp; + }; + struct FmhaFwdCommonBiasKargs { const void* bias_ptr = nullptr; @@ -242,7 +267,8 @@ struct FmhaFwdKernel std::conditional_t>, std::conditional_t>, std::conditional_t>, - std::conditional_t> + std::conditional_t>, + std::conditional_t> { ck_tile::index_t batch_stride_q; ck_tile::index_t batch_stride_k; @@ -260,7 +286,8 @@ struct FmhaFwdKernel std::conditional_t>, std::conditional_t>, std::conditional_t>, - std::conditional_t> + std::conditional_t>, + std::conditional_t> { const int32_t* seqstart_q_ptr; const int32_t* seqstart_k_ptr; @@ -269,6 +296,13 @@ struct FmhaFwdKernel using Kargs = std::conditional_t; + struct BlockIndices + { + ck_tile::index_t batch_idx; + ck_tile::index_t qo_head_idx; + ck_tile::index_t kv_head_idx; + }; + template CK_TILE_HOST static constexpr std::enable_if_t MakeKargsImpl(const void* q_ptr, @@ -287,6 +321,7 @@ struct FmhaFwdKernel float scale_s, float 
scale_p, float scale_o, + float logits_soft_cap, ck_tile::index_t stride_q, ck_tile::index_t stride_k, ck_tile::index_t stride_v, @@ -343,6 +378,7 @@ struct FmhaFwdKernel {}, // placeholder for lse {}, // placeholder for fp8_static_quant args {}, // placeholder for dropout + {}, // placeholder for logits_soft_cap batch_stride_q, batch_stride_k, batch_stride_v, @@ -398,6 +434,10 @@ struct FmhaFwdKernel kargs.batch_stride_randval = batch_stride_randval; kargs.is_store_randval = s_randval; } + if constexpr(kHasLogitsSoftCap) + { + kargs.init_logits_soft_cap(logits_soft_cap); + } return kargs; } @@ -421,6 +461,7 @@ struct FmhaFwdKernel float scale_s, float scale_p, float scale_o, + float logits_soft_cap, ck_tile::index_t stride_q, ck_tile::index_t stride_k, ck_tile::index_t stride_v, @@ -465,6 +506,7 @@ struct FmhaFwdKernel scale_s, scale_p, scale_o, + logits_soft_cap, stride_q, stride_k, stride_v, @@ -512,6 +554,7 @@ struct FmhaFwdKernel float scale_s, float scale_p, float scale_o, + float logits_soft_cap, ck_tile::index_t stride_q, ck_tile::index_t stride_k, ck_tile::index_t stride_v, @@ -556,6 +599,7 @@ struct FmhaFwdKernel scale_s, scale_p, scale_o, + logits_soft_cap, stride_q, stride_k, stride_v, @@ -603,6 +647,7 @@ struct FmhaFwdKernel float scale_s, float scale_p, float scale_o, + float logits_soft_cap, ck_tile::index_t stride_q, ck_tile::index_t stride_k, ck_tile::index_t stride_v, @@ -652,6 +697,7 @@ struct FmhaFwdKernel {}, // placeholder for lse {}, // placeholder for fp8_static_quant args {}, // placeholder for dropout + {}, // placeholder for logits_soft_cap reinterpret_cast(seqstart_q_ptr), reinterpret_cast(seqstart_k_ptr), reinterpret_cast(seqlen_k_ptr)}; @@ -703,6 +749,10 @@ struct FmhaFwdKernel kargs.nhead_stride_randval = nhead_stride_randval; kargs.is_store_randval = s_randval; } + if constexpr(kHasLogitsSoftCap) + { + kargs.init_logits_soft_cap(logits_soft_cap); + } return kargs; } @@ -727,6 +777,7 @@ struct FmhaFwdKernel float scale_s, float 
scale_p, float scale_o, + float logits_soft_cap, ck_tile::index_t stride_q, ck_tile::index_t stride_k, ck_tile::index_t stride_v, @@ -765,6 +816,7 @@ struct FmhaFwdKernel scale_s, scale_p, scale_o, + logits_soft_cap, stride_q, stride_k, stride_v, @@ -806,6 +858,7 @@ struct FmhaFwdKernel float scale_s, float scale_p, float scale_o, + float logits_soft_cap, ck_tile::index_t stride_q, ck_tile::index_t stride_k, ck_tile::index_t stride_v, @@ -844,6 +897,7 @@ struct FmhaFwdKernel scale_s, scale_p, scale_o, + logits_soft_cap, stride_q, stride_k, stride_v, @@ -1307,6 +1361,21 @@ struct FmhaFwdKernel } }(); + AttentionVariant variant; + const auto variant_params = [&] { + if constexpr(kHasLogitsSoftCap) + { + return ck_tile::LogitsSoftCapParams{ + mask, kargs.scale_s, kargs.logits_soft_cap, kargs.logits_soft_cap_rcp}; + } + else + { + return ck_tile::StandardAttentionParams{mask, kargs.scale_s}; + } + }(); + + BlockIndices block_indices{i_batch, i_nhead, i_nhead / kargs.nhead_ratio_qk}; + auto o_acc_tile = [&]() { if constexpr(kDoFp8StaticQuant) { @@ -1328,6 +1397,9 @@ struct FmhaFwdKernel mask, position_encoding, kargs.scale_s, + variant, + variant_params, + block_indices, smem_ptr, dropout); } @@ -1342,6 +1414,9 @@ struct FmhaFwdKernel mask, position_encoding, kargs.scale_s, + variant, + variant_params, + block_indices, smem_ptr, dropout); } diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp index ea1762abc1..63011d2ba9 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp @@ -6,6 +6,8 @@ #include "ck_tile/core.hpp" #include "ck_tile/ops/common.hpp" #include "ck_tile/ops/fmha/block/block_attention_bias_enum.hpp" +#include "ck_tile/ops/fmha/block/variants.hpp" + #include #include @@ -43,14 +45,15 @@ struct FmhaFwdSplitKVKernel static constexpr bool kPadSeqLenK = FmhaPipeline::kPadSeqLenK; static constexpr 
bool kPadHeadDimQ = FmhaPipeline::kPadHeadDimQ; static constexpr bool kPadHeadDimV = FmhaPipeline::kPadHeadDimV; + static constexpr bool kHasLogitsSoftCap = FmhaPipeline::kHasLogitsSoftCap; static constexpr auto BiasEnum = FmhaPipeline::BiasEnum; static constexpr bool kStoreLSE = FmhaPipeline::kStoreLSE; static constexpr bool kDoFp8StaticQuant = FmhaPipeline::Problem::kDoFp8StaticQuant; static constexpr bool kIsPagedKV = FmhaPipeline::Problem::kIsPagedKV; static constexpr bool kMergeNumHeadGroupsSeqLenQ = FmhaPipeline::Problem::kMergeNumHeadGroupsSeqLenQ; - - using FmhaMask = ck_tile::remove_cvref_t; + using AttentionVariant = ck_tile::remove_cvref_t; + using FmhaMask = ck_tile::remove_cvref_t; static constexpr bool kHasMask = FmhaMask::IsMasking; static_assert(!kMergeNumHeadGroupsSeqLenQ || @@ -95,7 +98,7 @@ struct FmhaFwdSplitKVKernel "w" + _TS_(g1wt::at(ck_tile::number<0>{})) + "x" + _TS_(g1wt::at(ck_tile::number<1>{})) + "x" + _TS_(g1wt::at(ck_tile::number<2>{})) + "_" + (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" + "v" + (std::is_same_v ? "r" : "c") + (pn.empty() ? "_npad" : "_" + pn) + - (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr::name)) + + (kHasLogitsSoftCap ? "_logits" : "_nlogits" ) + (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr::name)) + (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kStoreLSE ? "_lse" : "_nlse" ) + (kDoFp8StaticQuant ? "_squant" : "_nsquant") + (kIsPagedKV ? 
"_pagedkv" : "_npagedkv" ); #undef _SS_ @@ -150,6 +153,28 @@ struct FmhaFwdSplitKVKernel ck_tile::index_t split_stride_o_acc; }; + struct LogitsSoftCapKargs + { + LogitsSoftCapKargs() = default; + + void init_logits_soft_cap(float logits_soft_cap_) + { + if(0 < logits_soft_cap_) + { + logits_soft_cap = logits_soft_cap_; + logits_soft_cap_rcp = 1.f / logits_soft_cap; + } + else + { + logits_soft_cap = 0.f; + logits_soft_cap_rcp = 0.f; + } + } + + float logits_soft_cap; + float logits_soft_cap_rcp; + }; + struct CommonBiasKargs { const void* bias_ptr = nullptr; @@ -207,7 +232,8 @@ struct FmhaFwdSplitKVKernel EmptyKargs<0>>>, std::conditional_t>, std::conditional_t>, - std::conditional_t + std::conditional_t, + std::conditional_t> { const int32_t* seqlen_k_ptr; @@ -229,7 +255,8 @@ struct FmhaFwdSplitKVKernel EmptyKargs<0>>>, std::conditional_t>, std::conditional_t>, - std::conditional_t> + std::conditional_t>, + std::conditional_t> { const int32_t* seqstart_q_ptr; const int32_t* seqstart_k_ptr; @@ -243,6 +270,13 @@ struct FmhaFwdSplitKVKernel using Kargs = std::conditional_t; + struct BlockIndices + { + ck_tile::index_t batch_idx; + ck_tile::index_t qo_head_idx; + ck_tile::index_t kv_head_idx; + }; + template __host__ static constexpr std::enable_if_t MakeKargs(const void* q_ptr, @@ -268,6 +302,7 @@ struct FmhaFwdSplitKVKernel const void* cache_batch_idx, float scale_s, float scale_p, + float logits_soft_cap, ck_tile::index_t stride_q, ck_tile::index_t stride_k, ck_tile::index_t stride_v, @@ -324,6 +359,7 @@ struct FmhaFwdSplitKVKernel {}, // placeholder for mask {}, // placeholder for fp8_static_quant args {}, // placeholder for paged-block table or cache_batch_idx + {}, // placeholder for logits_soft_cap reinterpret_cast(seqlen_k_ptr), batch_stride_q, batch_stride_k, @@ -363,6 +399,10 @@ struct FmhaFwdSplitKVKernel { kargs.cache_batch_idx = reinterpret_cast(cache_batch_idx); } + if constexpr(kHasLogitsSoftCap) + { + kargs.init_logits_soft_cap(logits_soft_cap); + } 
return kargs; } @@ -392,6 +432,7 @@ struct FmhaFwdSplitKVKernel bool is_gappy, float scale_s, float scale_p, + float logits_soft_cap, ck_tile::index_t stride_q, ck_tile::index_t stride_k, ck_tile::index_t stride_v, @@ -444,6 +485,7 @@ struct FmhaFwdSplitKVKernel {}, // placeholder for mask {}, // placeholder for fp8_static_quant args {}, // placeholder for paged-block table + {}, // placeholder for logits_soft_cap reinterpret_cast(seqstart_q_ptr), reinterpret_cast(seqstart_k_ptr), reinterpret_cast(seqlen_k_ptr), @@ -478,6 +520,10 @@ struct FmhaFwdSplitKVKernel kargs.page_block_size = page_block_size; kargs.is_gappy = is_gappy; } + if constexpr(kHasLogitsSoftCap) + { + kargs.init_logits_soft_cap(logits_soft_cap); + } return kargs; } @@ -968,6 +1014,21 @@ struct FmhaFwdSplitKVKernel } }(); + AttentionVariant variant; + const auto variant_params = [&] { + if constexpr(kHasLogitsSoftCap) + { + return ck_tile::LogitsSoftCapParams{ + mask, kargs.scale_s, kargs.logits_soft_cap, kargs.logits_soft_cap_rcp}; + } + else + { + return ck_tile::StandardAttentionParams{mask, kargs.scale_s}; + } + }(); + + BlockIndices block_indices{i_batch, i_nhead, i_nhead_k}; + auto o_acc_tile = [&, i_split_ = i_split]() { if constexpr(kDoFp8StaticQuant) { @@ -991,6 +1052,9 @@ struct FmhaFwdSplitKVKernel mask, position_encoding, kargs.scale_s, + variant, + variant_params, + block_indices, kv_l2p_offset, smem_ptr); } @@ -1008,6 +1072,9 @@ struct FmhaFwdSplitKVKernel mask, position_encoding, kargs.scale_s, + variant, + variant_params, + block_indices, kv_l2p_offset, smem_ptr); } diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp new file mode 100644 index 0000000000..e07cf1c94e --- /dev/null +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp @@ -0,0 +1,900 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, 
Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" +#include "ck_tile/ops/fmha/block/block_attention_bias_enum.hpp" +#include "ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async_default_policy.hpp" +#include "ck_tile/ops/fmha/block/block_dropout.hpp" +#include "ck_tile/ops/reduce/block/block_reduce.hpp" + +namespace ck_tile { + +// a variation of qr/ks/vs, where we use async copy to load k (potentially v in the future) +template +struct BlockFmhaBatchPrefillPipelineQRKSVSAsync +{ + using Problem = remove_cvref_t; + using Policy = remove_cvref_t; + using QDataType = remove_cvref_t; + using KDataType = remove_cvref_t; + using VDataType = remove_cvref_t; + using SaccDataType = remove_cvref_t; + using SMPLComputeDataType = remove_cvref_t; + using BiasDataType = remove_cvref_t; + using RandValOutputDataType = remove_cvref_t; + using LSEDataType = remove_cvref_t; + using PDataType = remove_cvref_t; + using OaccDataType = remove_cvref_t; + using ODataType = remove_cvref_t; + using AttentionVariant = remove_cvref_t; + using FmhaMask = remove_cvref_t; + + using BlockFmhaShape = remove_cvref_t; + using VLayout = remove_cvref_t; + static constexpr bool kQLoadOnce = true; // if q_tile load whole block length (hdim) at once + static_assert(kQLoadOnce == Policy::QLoadOnce); + + static constexpr index_t kBlockSize = Problem::kBlockSize; + + static constexpr index_t kM0 = BlockFmhaShape::kM0; + static constexpr index_t kN0 = BlockFmhaShape::kN0; + static constexpr index_t kK0 = BlockFmhaShape::kK0; + static constexpr index_t kN1 = BlockFmhaShape::kN1; + static constexpr index_t kK1 = BlockFmhaShape::kK1; + static constexpr index_t kQKHeaddim = BlockFmhaShape::kQKHeaddim; + static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim; + static constexpr auto I0 = number<0>{}; + static constexpr auto I1 = number<1>{}; + static constexpr auto I2 = 
number<2>{}; + static constexpr auto I3 = number<3>{}; + + static_assert(kSubQKHeaddim <= 256, "hdim bigger than 256 is not suitable for this pipeline!"); + + static constexpr bool kIsGroupMode = Problem::kIsGroupMode; + // TODO: seq_q always support padding, hdim_q/v support multiple of vector(like 8x) + // only need special care about seq_k padding (oob need set -INF of p instead of zero) + static_assert(Problem::kPadSeqLenQ == true && Problem::kPadHeadDimQ == true && + Problem::kPadHeadDimV == true); + static constexpr bool kPadSeqLenQ = true; + static constexpr bool kPadSeqLenK = Problem::kPadSeqLenK; + static constexpr bool kPadHeadDimQ = true; // support multiple of vector(like 8x) + static constexpr bool kPadHeadDimV = true; // support multiple of vector(like 8x) + static constexpr bool kHasLogitsSoftCap = Problem::kHasLogitsSoftCap; + static constexpr auto BiasEnum = Problem::BiasEnum; + static constexpr bool kStoreLSE = Problem::kStoreLSE; + static constexpr bool kHasDropout = Problem::kHasDropout; + + static_assert((CK_TILE_FMHA_FWD_FAST_EXP2 && + (kHasLogitsSoftCap && Problem::BiasEnum == BlockAttentionBiasEnum::NO_BIAS || + !kHasLogitsSoftCap)) || + (!CK_TILE_FMHA_FWD_FAST_EXP2 && !kHasLogitsSoftCap)); + + // last dimension vector length used to create tensor view(and decide buffer_load vector length) + // ... together with tensor distribution. tensor dist should able to overwrite this + static constexpr index_t kAlignmentQ = Policy::template GetAlignmentQ(); + static constexpr index_t kAlignmentK = Policy::template GetAlignmentK(); + static constexpr index_t kAlignmentV = []() { + if constexpr(std::is_same_v) + return Policy::template GetAlignmentV(); + else + return kPadSeqLenK ? 1 : Policy::template GetAlignmentV(); + }(); + static constexpr index_t kAlignmentO = Policy::template GetAlignmentO(); + static constexpr index_t kAlignmentBias = + kPadSeqLenK ? 
1 : Policy::template GetAlignmentBias(); + +#if CK_TILE_FMHA_FWD_FAST_EXP2 + static constexpr auto R_LOG2E = 1.0 / log2e_v; +#endif + + static constexpr index_t kBlockPerCu = []() { + if constexpr(Problem::kBlockPerCu != -1) + return Problem::kBlockPerCu; + else + { + // minimize occupancy + if constexpr(BiasEnum != BlockAttentionBiasEnum::NO_BIAS && kHasDropout) + { + return 1; + } + + if constexpr(kQKHeaddim <= 32) + { + if constexpr(kPadSeqLenK && BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS && + FmhaMask::IsMasking) + return 1; + else + return 2; + } + else if constexpr(kQKHeaddim <= 64) + { + if constexpr(kPadSeqLenK && BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) + return 2; + else + return 3; + } + else if constexpr(kQKHeaddim <= 128) + { + if constexpr(kPadSeqLenK && BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) + return 1; + else + return 2; + } + else if constexpr(kQKHeaddim <= 192) + { + if constexpr(kPadSeqLenK && BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) + return 1; + else + return 2; + } + else if constexpr(kQKHeaddim <= 256) + { + return 1; + } + else + { + return 1; + }; + } + }(); + + static constexpr const char* name = "qr_async"; + + using DropoutType = std::conditional_t; + + CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize() + { + return Policy::template GetSmemSize(); + } + + template + CK_TILE_HOST_DEVICE auto + operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*K0 tile + const QElementFunction& q_element_func, + const KDramBlockWindowTmp& k_dram_block_window_tmp, // N0*K0 tile + const KElementFunction& /*k_element_func*/, + const VDramBlockWindowTmp& v_dram_block_window_tmp, // N1*K1 tile + const VElementFunction& v_element_func, + const BiasDramBlockWindowTmp& bias_dram_block_window_tmp, // M0*N0 tile + const BiasElementFunction& bias_element_func, + RandValDramBlockWindowTmp& randval_dram_block_window_tmp, + LSEDramBlockWindowTmp& lse_dram_window_tmp, // M0*1 tile + 
const LSEElementFunction& lse_element_func, + const SAccElementFunction& s_acc_element_func, + const PComputeElementFunction& p_compute_element_func, + const OAccElementFunction& o_acc_element_func, + FmhaMask mask, + PositionEncoding position_encoding, + float scale_s, + const AttentionVariant& variant, + const AttentionVariantParams& variant_params, + const BlockIndices& block_indices, + void* smem_ptr, + const index_t* page_idx, + const index_t stride_k, + const index_t stride_v, + DropoutType& dropout) const + { + static_assert( + std::is_same_v> && + std::is_same_v> && + std::is_same_v>, + "wrong!"); + + static_assert(kM0 == QDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && + kN0 == KDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && + kK0 == KDramBlockWindowTmp{}.get_window_lengths()[number<1>{}] && + kN1 == VDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && + kK1 == VDramBlockWindowTmp{}.get_window_lengths()[number<1>{}] && + kM0 == BiasDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] && + kN0 == BiasDramBlockWindowTmp{}.get_window_lengths()[number<1>{}], + "wrong!"); + + constexpr auto LdsSeq = Policy::template GetLdsBufferSequence(); + + // K tile in LDS + auto k_lds_ptr = reinterpret_cast(smem_ptr); + auto k_lds_store = generate_tuple( + [&](auto i_buf) { + return make_tile_window( + make_tensor_view( + k_lds_ptr, Policy::template MakeKLdsStoreBlockDescriptor(i_buf)), + Policy::template MakeKLdsStoreBlockDescriptor(i_buf).get_lengths(), + {0, 0, 0}); + }, + number{}); + + auto k_lds_Load_view = make_tensor_view( + k_lds_ptr, Policy::template MakeKLdsLoadBlockDescriptor()); + + auto k_lds_load = + make_tile_window(k_lds_Load_view, + Policy::template MakeKLdsLoadBlockDescriptor().get_lengths(), + {0, 0}); + + // V tile in LDS + auto v_lds = make_tensor_view( + reinterpret_cast(smem_ptr), + Policy::template MakeVLdsBlockDescriptor()); + auto v_lds_window = make_tile_window( + v_lds, Policy::template 
MakeVLdsBlockDescriptor().get_lengths(), {0, 0}); + + // Block GEMM + constexpr auto gemm_0 = Policy::template GetQKBlockGemm(); + constexpr auto gemm_1 = Policy::template GetKVBlockGemm(); + + auto q_dram_window = make_tile_window(q_dram_block_window_tmp.get_bottom_tensor_view(), + q_dram_block_window_tmp.get_window_lengths(), + q_dram_block_window_tmp.get_window_origin(), + Policy::template MakeQRegTileDistribution()); + q_dram_window.init_raw(); + + // TODO: we use async Copy for K, which is inline asm + // a side effect is we have to use inline asm for q as well + auto q = decltype(load_tile(q_dram_window)){}; + // TODO: start from rocm-6.2, compiler will have problem if manually set clear of q. + // however, q would be cleared in the constructor of static distributed tensor + // set_tile(q, number<0>{}); // use per-dword clear to avoid scratch + load_tile_raw(q, q_dram_window); + __builtin_amdgcn_sched_barrier(0); + + using SaccBlockTileType = decltype(gemm_0.MakeCBlockTile()); + auto s_acc = SaccBlockTileType{}; + + // reduction function for softmax + const auto f_max = [](auto e0, auto e1) { return max(e0, e1); }; + const auto f_sum = [](auto e0, auto e1) { return e0 + e1; }; + + // infer Sacc, S, P, M, L, Oacc type + using SBlockTileType = decltype(cast_tile(s_acc)); + + using MLBlockTileType = decltype(block_tile_reduce( + SBlockTileType{}, sequence<1>{}, f_max, SMPLComputeDataType{0})); + + using OaccBlockTileType = decltype(gemm_1.MakeCBlockTile()); + + // init Oacc, M, L + auto o_acc = OaccBlockTileType{}; + auto m = MLBlockTileType{}; + auto l = MLBlockTileType{}; + + clear_tile(o_acc); + set_tile(m, -numeric::infinity()); + clear_tile(l); + + __builtin_amdgcn_sched_barrier(0); + const auto q_origin = q_dram_window.get_window_origin(); + const auto [seqlen_k_start, seqlen_k_end] = + mask.GetTileRangeAlongX(q_origin.at(number<0>{}), number{}, number{}); + + const auto num_total_loop = integer_divide_ceil(seqlen_k_end - seqlen_k_start, kN0); + + // check 
early exit if no work to do + if constexpr(FmhaMask::IsMasking || kPadSeqLenK) + { + if(num_total_loop <= 0) + { + if constexpr(kStoreLSE) + { + auto lse = + make_static_distributed_tensor(m.get_tile_distribution()); + + set_tile(lse, -numeric::infinity()); + + store_tile(lse_dram_window_tmp, tile_elementwise_in(lse_element_func, lse)); + } + buffer_load_fence(0); // rocm-6.1, if whole tile is masked out, need to fence(0) + // otherwise will have compute error(maybe compiler bug?) + + // Note: here occ are all cleard, return it + return o_acc; + } + __builtin_amdgcn_sched_barrier(0); // make sure sched_barrier(0) for this check + } + + auto k_dram_block_window = + make_tile_window(k_dram_block_window_tmp.get_bottom_tensor_view(), + k_dram_block_window_tmp.get_window_lengths(), + {seqlen_k_start, 0}); + + auto k_dist = Policy::template MakeKDramTileDistribution(); + auto k_coord = k_dist.calculate_index(); + using KDstrEncode = typename decltype(k_dist)::DstrEncode; + constexpr index_t NRepeat = KDstrEncode::hs_lengthss_[I0][I0]; + statically_indexed_array k_offsets; + static_for<0, NRepeat, 1>{}([&](auto n0) { + k_offsets[n0] = page_idx[k_coord[0] + kN0 / NRepeat * n0.value] * stride_k; + }); + auto k_dram_window = make_tile_scatter_gather(k_dram_block_window.get_bottom_tensor_view(), + k_dram_block_window.get_window_lengths(), + k_dram_block_window.get_window_origin(), + k_dist, + k_offsets); // K DRAM tile window for + k_dram_window.init_raw(); + constexpr auto k_oob_ck = bool_constant{}; + constexpr auto k_pre_np = [&]() { + if constexpr(kPadSeqLenK && + (BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS || + (BiasEnum != BlockAttentionBiasEnum::NO_BIAS && kHasDropout))) + return bool_constant{}; + else + return bool_constant{}; + }(); + + const auto bias_origin = bias_dram_block_window_tmp.get_window_origin(); + auto bias_dram_window = + make_tile_window(bias_dram_block_window_tmp.get_bottom_tensor_view(), + bias_dram_block_window_tmp.get_window_lengths(), + 
{bias_origin.at(number<0>{}), seqlen_k_start}, // M/N + Policy::template MakeBiasDramTileDistribution()); + + auto randval_dram_window = dropout.template MakeRandvalDramWindow( + randval_dram_block_window_tmp, seqlen_k_start); + + auto v_dist = Policy::template MakeVDramTileDistribution(); + auto v_coord = v_dist.calculate_index(); + const auto VPageIndexDim = I1; + using VDstrEncode = typename decltype(v_dist)::DstrEncode; + constexpr index_t V_KRepeat = VDstrEncode::hs_lengthss_[I1][I3]; + statically_indexed_array v_offsets; + (void)stride_k; + static_for<0, V_KRepeat, 1>{}([&](auto k0) { + v_offsets[k0] = page_idx[v_coord[VPageIndexDim] + k0.value] * stride_v; + }); + + auto v_dram_window = + make_tile_scatter_gather(v_dram_block_window_tmp.get_bottom_tensor_view(), + v_dram_block_window_tmp.get_window_lengths(), + {0, seqlen_k_start}, // TODO: hdim split? + v_dist, + v_offsets, + VPageIndexDim); + + // prefetch K tile + async_load_tile_raw( + k_lds_store(LdsSeq.at(number<0>{})), k_dram_window, number<-1>{}, k_oob_ck, k_pre_np); + move_tile_window(k_dram_window, {0, kK0}); + __builtin_amdgcn_sched_barrier(0); + + buffer_load_fence(k_dram_window.get_num_of_access(), q.get_thread_buffer()); + (void)q_element_func; // ??? 
rocm-6.x if use q element func will have scratch on hdim=64/32 + // auto q_tile = q; // tile_elementwise_in(q_element_func, q); + + index_t i_total_loops = 0; + constexpr index_t k0_loops = kQKHeaddim / kK0; + constexpr index_t k1_loops = kN0 / kK1; + + static_assert(1 <= k0_loops); + static_assert(1 <= k1_loops); + // main loop + do + { + // STAGE 1, QK gemm + clear_tile(s_acc); // initialize C + if constexpr(k0_loops > 1) + { + static_for<0, k0_loops - 1, 1>{}([&](auto i_k0) { + async_load_tile_raw(k_lds_store(number{})>{}), + k_dram_window, + number<-1>{}, + k_oob_ck, + k_pre_np); + if constexpr(i_k0 < k0_loops - 1) + move_tile_window(k_dram_window, {0, kK0}); + + async_load_fence(k_dram_window.get_num_of_access()); + __builtin_amdgcn_s_barrier(); + __builtin_amdgcn_sched_barrier(0); + gemm_0(s_acc, + get_slice_tile( + q, sequence<0, i_k0 * kK0>{}, sequence{}), + get_slice_tile(k_lds_load, + sequence<(LdsSeq.at(number{})) * kN0, 0>{}, + sequence<(LdsSeq.at(number{}) + 1) * kN0, kK0>{})); + }); + } + + // TODO: this to fix a bug when loop smaller than 2, + // the following fence/barrier will be scheduled inside 1st loop + if constexpr(k0_loops <= 2) + __builtin_amdgcn_sched_barrier(0); + + async_load_fence(); + __builtin_amdgcn_s_barrier(); + + const auto bias_tile = load_tile(bias_dram_window); // load bias tile + auto v_buf = load_tile(v_dram_window, number<-1>{}, bool_constant{}); + static_for<0, V_KRepeat, 1>{}([&](auto k0) { + v_offsets[k0] = page_idx[kK1 + v_coord[VPageIndexDim] + k0.value] * stride_v; + }); + v_dram_window.update_page_idx(v_offsets); + + __builtin_amdgcn_sched_barrier(0); + { // tail + gemm_0( + s_acc, + get_slice_tile( + q, sequence<0, (k0_loops - 1) * kK0>{}, sequence{}), + get_slice_tile(k_lds_load, + sequence<(LdsSeq.at(number{})) * kN0, 0>{}, + sequence<(LdsSeq.at(number{}) + 1) * kN0, kK0>{})); + } + __builtin_amdgcn_sched_barrier(1); + + // STAGE 2, scale_s, add bias, mask, softmax + if constexpr(BiasEnum == 
BlockAttentionBiasEnum::ELEMENTWISE_BIAS) + { + s_acc = tile_elementwise_in(s_acc_element_func, s_acc); + tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc); + tile_elementwise_inout( + [&](auto& x, const auto& y) { +#if !CK_TILE_FMHA_FWD_FAST_EXP2 + x += type_convert(bias_element_func(y)); +#else + x += log2e_v * + type_convert(bias_element_func(y)); +#endif + }, + s_acc, + bias_tile); + } + else if constexpr(BiasEnum == BlockAttentionBiasEnum::ALIBI) + { + const auto k_origin = k_dram_block_window.get_window_origin(); + constexpr auto s_spans = decltype(s_acc)::get_distributed_spans(); + s_acc = tile_elementwise_in(s_acc_element_func, s_acc); + sweep_tile_span(s_spans[number<0>{}], [&](auto idx0) { + sweep_tile_span(s_spans[number<1>{}], [&](auto idx1) { + const auto tile_idx = get_x_indices_from_distributed_indices( + s_acc.get_tile_distribution(), make_tuple(idx0, idx1)); + + const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{}); + const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{}); + constexpr auto i_j_idx = make_tuple(idx0, idx1); + + s_acc(i_j_idx) *= scale_s; + position_encoding.update(s_acc(i_j_idx), row, col); + }); + }); + } + else + { + s_acc = tile_elementwise_in(s_acc_element_func, s_acc); + if constexpr(kHasLogitsSoftCap) + { + auto apply_logits_transform = + [&variant, &variant_params, &block_indices](auto& x) { + x = variant.LogitsTransform(variant_params, + variant.QueryTransform(variant_params, x), + block_indices.batch_idx, + block_indices.qo_head_idx, + block_indices.kv_head_idx); + }; +#if !CK_TILE_FMHA_FWD_FAST_EXP2 + for(index_t i = 0; i < s_acc.thread_buf_.size(); ++i) + { + apply_logits_transform(s_acc.thread_buf_[i]); + } +#else + for(index_t i = 0; i < s_acc.thread_buf_.size(); ++i) + { + apply_logits_transform(s_acc.thread_buf_[i]); + } +#endif + } + else + { +#if !CK_TILE_FMHA_FWD_FAST_EXP2 + tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc); +#endif + } + } + 
move_tile_window(bias_dram_window, {0, kN0}); + if constexpr(kPadSeqLenK || FmhaMask::IsMasking) + { + const auto k_origin = k_dram_block_window.get_window_origin(); + bool need_perpixel_check = mask.IsEdgeTile(q_origin.at(number<0>{}), + k_origin.at(number<0>{}), + number{}, + number{}); + + if(need_perpixel_check) + { + set_tile_if( + s_acc, -numeric::infinity(), [&](auto tile_idx) { + const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{}); + const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{}); + return !variant.LogitsMask(variant_params, + block_indices.batch_idx, + row, + col, + block_indices.qo_head_idx, + block_indices.kv_head_idx); + }); + } + } + + const auto s = cast_tile(s_acc); // S{j} + auto m_local = block_tile_reduce( + s, + sequence<1>{}, + f_max, + -numeric::infinity()); // m_local = rowmax(S{j}) + block_tile_reduce_sync(m_local, f_max, bool_constant{}); + + const auto m_old = m; // m{j-1} + tile_elementwise_inout( + [](auto& e0, auto e1, auto e2) { e0 = max(e1, e2); }, m, m_old, m_local); // m{j} + + auto p_compute = make_static_distributed_tensor( + s.get_tile_distribution()); // Pcompute{j} + + __builtin_amdgcn_sched_barrier(0x7F); + // store & prefetch next v, after the max reduction + if constexpr(std::is_same_v) + { + auto v_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledVRegBlockDescriptor()); + shuffle_tile(v_shuffle_tmp, v_buf); + + auto v_lds_window_tmp = + get_slice_tile(v_lds_window, + sequence<(LdsSeq.at(number{})) * kN1, 0>{}, + sequence<(LdsSeq.at(number{}) + 1) * kN1, kK1>{}); + + store_tile( + v_lds_window_tmp, + tile_elementwise_in(v_element_func, v_shuffle_tmp)); // store the prefetch + } + else + { + auto v_lds_window_tmp = + get_slice_tile(v_lds_window, + sequence<(LdsSeq.at(number{})) * kN1, 0>{}, + sequence<(LdsSeq.at(number{}) + 1) * kN1, kK1>{}); + store_tile(v_lds_window_tmp, + tile_elementwise_in(v_element_func, v_buf)); // store the prefetch + } + + if 
constexpr(k1_loops > 1) + { + move_tile_window( + v_dram_window, + {0, kK1}); // will have scratch if move this right after load_tile(v_dram)... + v_buf = load_tile( + v_dram_window, number<-1>{}, bool_constant{}); // load next v_buf + static_for<0, V_KRepeat, 1>{}([&](auto k0) { + v_offsets[k0] = + page_idx[kK1 * 2 + v_coord[VPageIndexDim] + k0.value] * stride_v; + }); + v_dram_window.update_page_idx(v_offsets); + } + __builtin_amdgcn_sched_barrier(0); + + static const auto get_validated_m = [](SMPLComputeDataType raw_m) { + /// NOTICE: bias might be materialized mask including -inf values, need + /// consideration. alibi does not have this problem + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS || + FmhaMask::IsMasking) + { + return raw_m == -numeric::infinity() + ? type_convert(0.f) + : raw_m; + } + else + { + return raw_m; + } + }; + + constexpr auto p_spans = decltype(p_compute)::get_distributed_spans(); + sweep_tile_span(p_spans[number<0>{}], [&](auto idx0) { + constexpr auto i_idx = make_tuple(idx0); +#if CK_TILE_FMHA_FWD_FAST_EXP2 + auto row_max = scale_s * get_validated_m(m[i_idx]); +#endif + sweep_tile_span(p_spans[number<1>{}], [&](auto idx1) { + constexpr auto i_j_idx = make_tuple(idx0, idx1); +#if CK_TILE_FMHA_FWD_FAST_EXP2 + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS || + BiasEnum == BlockAttentionBiasEnum::ALIBI) + { + p_compute(i_j_idx) = exp2(s[i_j_idx] - get_validated_m(m[i_idx])); + } + else + { + if constexpr(kHasLogitsSoftCap) + { + p_compute(i_j_idx) = exp2(s[i_j_idx] - get_validated_m(m[i_idx])); + } + else + { + p_compute(i_j_idx) = exp2(scale_s * s[i_j_idx] - row_max); + } + } +#else + p_compute(i_j_idx) = exp(s[i_j_idx] - get_validated_m(m[i_idx])); +#endif + }); + }); + + auto rowsum_p = block_tile_reduce( + p_compute, sequence<1>{}, f_sum, SMPLComputeDataType{0}); // rowsum(Pcompute{j}) + + block_tile_reduce_sync(rowsum_p, f_sum, bool_constant{}); + // l{j}, Oacc{j} + constexpr auto o_spans = 
decltype(o_acc)::get_distributed_spans(); + sweep_tile_span(o_spans[number<0>{}], [&](auto idx0) { + constexpr auto i_idx = make_tuple(idx0); +#if CK_TILE_FMHA_FWD_FAST_EXP2 + const auto tmp = [&]() { + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS || + BiasEnum == BlockAttentionBiasEnum::ALIBI) + { + return exp2(m_old[i_idx] - get_validated_m(m[i_idx])); + } + else + { + if constexpr(kHasLogitsSoftCap) + { + return exp2(m_old[i_idx] - get_validated_m(m[i_idx])); + } + else + { + auto row_max = scale_s * get_validated_m(m[i_idx]); + return exp2(scale_s * m_old[i_idx] - row_max); + } + } + }(); +#else + const auto tmp = exp(m_old[i_idx] - get_validated_m(m[i_idx])); +#endif + l(i_idx) = tmp * l[i_idx] + rowsum_p[i_idx]; + sweep_tile_span(o_spans[number<1>{}], [&](auto idx1) { + constexpr auto i_j_idx = make_tuple(idx0, idx1); + // FIXME: this uses a different equation from the FA v2 paper, + // but produces the correct result. + // Is the equation wrong? + o_acc(i_j_idx) *= tmp; + }); + }); + + if constexpr(kHasDropout) + { + auto randval_ptr = + reinterpret_cast(smem_ptr) + Policy::template GetSmemSizeKV(); + dropout.template Run( + randval_ptr, + seqlen_k_start + i_total_loops * kN0, + p_compute, + randval_dram_window); + } + + const auto p = [&]() { + if constexpr(std::is_same_v) + return impl::cast_tile_pk_fp16_fp32( + tile_elementwise_in(p_compute_element_func, p_compute)); + else + return cast_tile( + tile_elementwise_in(p_compute_element_func, p_compute)); + }(); + + // STAGE 3, KV gemm + if constexpr(k1_loops > 1) + { + static_for<0, k1_loops - 1, 1>{}([&](auto i_k1) { + if constexpr(i_k1 != 0 && i_k1 < k1_loops - 1) + { + v_buf = load_tile( + v_dram_window, number<-1>{}, bool_constant{}); // load next v_buf + static_for<0, V_KRepeat, 1>{}([&](auto k0) { + v_offsets[k0] = page_idx[kK1 * 2 + i_k1.value * kK1 + + v_coord[VPageIndexDim] + k0.value] * + stride_v; + }); + v_dram_window.update_page_idx(v_offsets); + } + block_sync_lds(); + gemm_1(o_acc, + 
get_slice_tile( + p, sequence<0, i_k1 * kK1>{}, sequence{}), + get_slice_tile( + v_lds_window, + sequence<(LdsSeq.at(number{})) * kN1, 0>{}, + sequence<(LdsSeq.at(number{}) + 1) * kN1, kK1>{})); + + if constexpr(std::is_same_v) + { + auto v_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledVRegBlockDescriptor()); + shuffle_tile(v_shuffle_tmp, v_buf); + auto v_lds_window_tmp = get_slice_tile( + v_lds_window, + sequence<(LdsSeq.at(number{})) * kN1, 0>{}, + sequence<(LdsSeq.at(number{}) + 1) * kN1, kK1>{}); + store_tile(v_lds_window_tmp, + tile_elementwise_in(v_element_func, + v_shuffle_tmp)); // store the prefetch + } + else + { + auto v_lds_window_tmp = get_slice_tile( + v_lds_window, + sequence<(LdsSeq.at(number{})) * kN1, 0>{}, + sequence<(LdsSeq.at(number{}) + 1) * kN1, kK1>{}); + store_tile(v_lds_window_tmp, + tile_elementwise_in(v_element_func, v_buf)); // store next v_buf + } + if constexpr(i_k1 < k1_loops - 1) + move_tile_window(v_dram_window, {0, kK1}); + }); + } + i_total_loops++; + if(i_total_loops < num_total_loop) + { + page_idx += kN0; + // move K tile windows + move_tile_window(k_dram_block_window, {kN0, 0}); + k_dram_window.set_window_origin(k_dram_block_window.get_window_origin()); + + static_for<0, NRepeat, 1>{}([&](auto n0) { + k_offsets[n0] = page_idx[k_coord[0] + kN0 / NRepeat * n0.value] * stride_k; + }); + k_dram_window.update_page_idx(k_offsets); + if constexpr(k1_loops >= 2 && + LdsSeq.at(number<0>{}) == LdsSeq.at(number{})) + __builtin_amdgcn_s_barrier(); + async_load_tile_raw(k_lds_store(LdsSeq.at(number<0>{})), + k_dram_window, + number<-1>{}, + k_oob_ck, + k_pre_np); + move_tile_window(k_dram_window, {0, kK0}); + } + // tail + { + block_sync_lds(); + gemm_1( + o_acc, + get_slice_tile(p, sequence<0, (k1_loops - 1) * kK1>{}, sequence{}), + get_slice_tile( + v_lds_window, + sequence<(LdsSeq.at(number{})) * kN1, 0>{}, + sequence<(LdsSeq.at(number{}) + 1) * kN1, kK1>{})); + } + } while(i_total_loops < 
num_total_loop); + + // store lse + if constexpr(kStoreLSE) + { + auto lse = make_static_distributed_tensor(m.get_tile_distribution()); + + constexpr auto lse_spans = decltype(lse)::get_distributed_spans(); + sweep_tile_span(lse_spans[number<0>{}], [&, m_ = m, l_ = l](auto idx0) { + constexpr auto i_idx = make_tuple(idx0); +#if CK_TILE_FMHA_FWD_FAST_EXP2 + if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS || + BiasEnum == BlockAttentionBiasEnum::ALIBI) + { + lse(i_idx) = m_[i_idx] * R_LOG2E + log(l_[i_idx]); + } + else + { + if constexpr(kHasLogitsSoftCap) + { + lse(i_idx) = m_[i_idx] * R_LOG2E + log(l_[i_idx]); + } + else + { + lse(i_idx) = m_[i_idx] * scale_s * R_LOG2E + log(l_[i_idx]); + } + } +#else + lse(i_idx) = m_[i_idx] + log(l_[i_idx]); +#endif + }); + + store_tile(lse_dram_window_tmp, tile_elementwise_in(lse_element_func, lse)); + } + + // finally, O + constexpr auto o_spans = decltype(o_acc)::get_distributed_spans(); + + sweep_tile_span(o_spans[number<0>{}], [&](auto idx0) { + constexpr auto i_idx = make_tuple(idx0); + const auto tmp = [&]() { + if constexpr(FmhaMask::IsMasking) + { + return l[i_idx] == 0.f ? 
0.f : 1 / l[i_idx]; + } + else + return 1 / l[i_idx]; + }(); + sweep_tile_span(o_spans[number<1>{}], [&](auto idx1) { + constexpr auto i_j_idx = make_tuple(idx0, idx1); + o_acc(i_j_idx) *= tmp; + }); + }); + + o_acc = tile_elementwise_in(o_acc_element_func, o_acc); + + return o_acc; + } + + template + CK_TILE_HOST_DEVICE auto + operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*K0 tile + const KDramBlockWindowTmp& k_dram_block_window_tmp, // N0*K0 tile + const VDramBlockWindowTmp& v_dram_block_window_tmp, // N1*K1 tile + const BiasDramBlockWindowTmp& bias_dram_block_window_tmp, // M0*N0 tile + RandValDramBlockWindowTmp& randval_dram_block_window_tmp, // M0*N0 tile + LSEDramBlockWindowTmp& lse_dram_block_window_tmp, // M0*1 tile + FmhaMask mask, + PositionEncoding position_encoding, + float scale_s, + const AttentionVariant& variant, + const AttentionVariantParams& variant_params, + const BlockIndices& block_indices, + void* smem_ptr, + const index_t* page_idx, + const index_t stride_k, + const index_t stride_v, + DropoutType& dropout) const + { + return operator()(q_dram_block_window_tmp, + identity{}, + k_dram_block_window_tmp, + identity{}, + v_dram_block_window_tmp, + identity{}, + bias_dram_block_window_tmp, + identity{}, + randval_dram_block_window_tmp, + lse_dram_block_window_tmp, + identity{}, + identity{}, + identity{}, + identity{}, + mask, + position_encoding, + scale_s, + variant, + variant_params, + block_indices, + smem_ptr, + page_idx, + stride_k, + stride_v, + dropout); + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async_default_policy.hpp new file mode 100644 index 0000000000..02731ca8f8 --- /dev/null +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async_default_policy.hpp @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: 
MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp" + +namespace ck_tile { + +// This pipeline is qkv all located in LDS +using BlockFmhaBatchPrefillPipelineQRKSVSAsyncDefaultPolicy = + BlockFmhaPipelineQXKSVSCustomPolicy; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp index 809c58f1d1..4d1c38e079 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp @@ -27,6 +27,7 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS using PDataType = remove_cvref_t; using OaccDataType = remove_cvref_t; using ODataType = remove_cvref_t; + using AttentionVariant = remove_cvref_t; using FmhaMask = remove_cvref_t; using BlockFmhaShape = remove_cvref_t; @@ -46,15 +47,21 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS static_assert(kSubQKHeaddim <= 256, "hdim bigger than 256 is not suitable for this pipeline!"); - static constexpr bool kIsGroupMode = Problem::kIsGroupMode; - static constexpr bool kPadSeqLenQ = Problem::kPadSeqLenQ; - static constexpr bool kPadSeqLenK = Problem::kPadSeqLenK; - static constexpr bool kPadHeadDimQ = Problem::kPadHeadDimQ; - static constexpr bool kPadHeadDimV = Problem::kPadHeadDimV; - static constexpr auto BiasEnum = Problem::BiasEnum; - static constexpr bool kStoreLSE = Problem::kStoreLSE; - static constexpr bool kIsPagedKV = Problem::kIsPagedKV; - static constexpr bool kHasUnevenSplits = Problem::kHasUnevenSplits; + static constexpr bool kIsGroupMode = Problem::kIsGroupMode; + static constexpr bool kPadSeqLenQ = Problem::kPadSeqLenQ; + static constexpr 
bool kPadSeqLenK = Problem::kPadSeqLenK; + static constexpr bool kPadHeadDimQ = Problem::kPadHeadDimQ; + static constexpr bool kPadHeadDimV = Problem::kPadHeadDimV; + static constexpr bool kHasLogitsSoftCap = Problem::kHasLogitsSoftCap; + static constexpr auto BiasEnum = Problem::BiasEnum; + static constexpr bool kStoreLSE = Problem::kStoreLSE; + static constexpr bool kIsPagedKV = Problem::kIsPagedKV; + static constexpr bool kHasUnevenSplits = Problem::kHasUnevenSplits; + + static_assert((CK_TILE_FMHA_FWD_FAST_EXP2 && + (kHasLogitsSoftCap && Problem::BiasEnum == BlockAttentionBiasEnum::NO_BIAS || + !kHasLogitsSoftCap)) || + (!CK_TILE_FMHA_FWD_FAST_EXP2 && !kHasLogitsSoftCap)); // last dimension vector length used to create tensor view(and decide buffer_load vector length) // ... together with tensor distribution. tensor dist should able to overwrite this @@ -128,7 +135,9 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS typename SAccElementFunction, typename PComputeElementFunction, typename OAccElementFunction, - typename PositionEncoding> + typename PositionEncoding, + typename AttentionVariantParams, + typename BlockIndices> CK_TILE_HOST_DEVICE auto operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*K0 tile const QElementFunction& q_element_func, @@ -150,6 +159,9 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS FmhaMask mask, PositionEncoding position_encoding, float scale_s, + const AttentionVariant& variant, + const AttentionVariantParams& variant_params, + const BlockIndices& block_indices, index_t kv_l2p_offset, // logical-to-physical offset of seqlen_k coordinate void* smem_ptr) const { @@ -453,9 +465,34 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS else { s_acc = tile_elementwise_in(s_acc_element_func, s_acc); + if constexpr(kHasLogitsSoftCap) + { + auto apply_logits_transform = + [&variant, &variant_params, &block_indices](auto& x) { + x = variant.LogitsTransform(variant_params, + 
variant.QueryTransform(variant_params, x), + block_indices.batch_idx, + block_indices.qo_head_idx, + block_indices.kv_head_idx); + }; #if !CK_TILE_FMHA_FWD_FAST_EXP2 - tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc); + for(index_t i = 0; i < s_acc.thread_buf_.size(); ++i) + { + apply_logits_transform(s_acc.thread_buf_[i]); + } +#else + for(index_t i = 0; i < s_acc.thread_buf_.size(); ++i) + { + apply_logits_transform(s_acc.thread_buf_[i]); + } #endif + } + else + { +#if !CK_TILE_FMHA_FWD_FAST_EXP2 + tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc); +#endif + } } move_tile_window(bias_dram_window, {0, kN0}); @@ -574,7 +611,14 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS } else { - p_compute(i_j_idx) = exp2(scale_s * s_new[i_j_idx] - row_max); + if constexpr(kHasLogitsSoftCap) + { + p_compute(i_j_idx) = exp2(s_new[i_j_idx] - get_validated_m(m[i_idx])); + } + else + { + p_compute(i_j_idx) = exp2(scale_s * s_new[i_j_idx] - row_max); + } } #else p_compute(i_j_idx) = exp(s_new[i_j_idx] - get_validated_m(m[i_idx])); @@ -603,8 +647,15 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS } else { - auto row_max = scale_s * get_validated_m(m[i_idx]); - return exp2(scale_s * m_old[i_idx] - row_max); + if constexpr(kHasLogitsSoftCap) + { + return exp2(m_old[i_idx] - get_validated_m(m[i_idx])); + } + else + { + auto row_max = scale_s * get_validated_m(m[i_idx]); + return exp2(scale_s * m_old[i_idx] - row_max); + } } }(); #else @@ -711,7 +762,14 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS } else { - lse_acc(i_idx) = m_[i_idx] * scale_s / C_LOG2E + log(l_[i_idx]); + if constexpr(kHasLogitsSoftCap) + { + lse_acc(i_idx) = m_[i_idx] / C_LOG2E + log(l_[i_idx]); + } + else + { + lse_acc(i_idx) = m_[i_idx] * scale_s / C_LOG2E + log(l_[i_idx]); + } } #else lse_acc(i_idx) = m_[i_idx] + log(l_[i_idx]); @@ -757,7 +815,9 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS typename VPageBlockNavigator, typename 
BiasDramBlockWindowTmp, typename LSEaccDramBlockWindowTmp, - typename PositionEncoding> + typename PositionEncoding, + typename AttentionVariantParams, + typename BlockIndices> CK_TILE_HOST_DEVICE auto operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*K0 tile const KDramBlockWindowLengths& k_dram_block_window_lengths, // N0*K0 tile @@ -771,6 +831,9 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS FmhaMask mask, PositionEncoding position_encoding, float scale_s, + const AttentionVariant& variant, + const AttentionVariantParams& variant_params, + const BlockIndices& block_indices, index_t kv_l2p_offset, // logical-to-physical offset of seqlen_k coordinate void* smem_ptr) const { @@ -794,6 +857,9 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS mask, position_encoding, scale_s, + variant, + variant_params, + block_indices, kv_l2p_offset, smem_ptr); } diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp index ce80dba5eb..7f5f79d7a7 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp @@ -26,6 +26,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS using PDataType = remove_cvref_t; using OaccDataType = remove_cvref_t; using ODataType = remove_cvref_t; + using AttentionVariant = remove_cvref_t; using FmhaMask = remove_cvref_t; using BlockFmhaShape = remove_cvref_t; @@ -45,15 +46,21 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS static_assert(kSubQKHeaddim <= 256, "hdim bigger than 256 is not suitable for this pipeline!"); - static constexpr bool kIsGroupMode = Problem::kIsGroupMode; - static constexpr bool kPadSeqLenQ = Problem::kPadSeqLenQ; - static constexpr bool kPadSeqLenK = Problem::kPadSeqLenK; - static constexpr bool kPadHeadDimQ = Problem::kPadHeadDimQ; - static constexpr bool kPadHeadDimV = 
Problem::kPadHeadDimV; - static constexpr auto BiasEnum = Problem::BiasEnum; - static constexpr bool kStoreLSE = Problem::kStoreLSE; - static constexpr bool kIsPagedKV = Problem::kIsPagedKV; - static constexpr bool kHasUnevenSplits = Problem::kHasUnevenSplits; + static constexpr bool kIsGroupMode = Problem::kIsGroupMode; + static constexpr bool kPadSeqLenQ = Problem::kPadSeqLenQ; + static constexpr bool kPadSeqLenK = Problem::kPadSeqLenK; + static constexpr bool kPadHeadDimQ = Problem::kPadHeadDimQ; + static constexpr bool kPadHeadDimV = Problem::kPadHeadDimV; + static constexpr bool kHasLogitsSoftCap = Problem::kHasLogitsSoftCap; + static constexpr auto BiasEnum = Problem::BiasEnum; + static constexpr bool kStoreLSE = Problem::kStoreLSE; + static constexpr bool kIsPagedKV = Problem::kIsPagedKV; + static constexpr bool kHasUnevenSplits = Problem::kHasUnevenSplits; + + static_assert((CK_TILE_FMHA_FWD_FAST_EXP2 && + (kHasLogitsSoftCap && Problem::BiasEnum == BlockAttentionBiasEnum::NO_BIAS || + !kHasLogitsSoftCap)) || + (!CK_TILE_FMHA_FWD_FAST_EXP2 && !kHasLogitsSoftCap)); // last dimension vector length used to create tensor view(and decide buffer_load vector length) // ... together with tensor distribution. 
tensor dist should able to overwrite this @@ -127,7 +134,9 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS typename SAccElementFunction, typename PComputeElementFunction, typename OAccElementFunction, - typename PositionEncoding> + typename PositionEncoding, + typename AttentionVariantParams, + typename BlockIndices> CK_TILE_HOST_DEVICE auto operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*K0 tile const QElementFunction& q_element_func, @@ -149,6 +158,9 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS FmhaMask mask, PositionEncoding position_encoding, float scale_s, + const AttentionVariant& variant, + const AttentionVariantParams& variant_params, + const BlockIndices& block_indices, index_t kv_l2p_offset, // logical-to-physical offset of seqlen_k coordinate void* smem_ptr) const { @@ -401,9 +413,28 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS else { s_acc = tile_elementwise_in(s_acc_element_func, s_acc); + if constexpr(kHasLogitsSoftCap) + { + auto apply_logits_transform = + [&variant, &variant_params, &block_indices](auto& x) { + x = variant.LogitsTransform(variant_params, + variant.QueryTransform(variant_params, x), + block_indices.batch_idx, + block_indices.qo_head_idx, + block_indices.kv_head_idx); + }; #if !CK_TILE_FMHA_FWD_FAST_EXP2 - tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc); + tile_elementwise_inout(apply_logits_transform, s_acc); +#else + tile_elementwise_inout(apply_logits_transform, s_acc); #endif + } + else + { +#if !CK_TILE_FMHA_FWD_FAST_EXP2 + tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc); +#endif + } } move_tile_window(bias_dram_window, {0, kN0}); @@ -497,7 +528,14 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS } else { - p_compute(i_j_idx) = exp2(scale_s * s[i_j_idx] - row_max); + if constexpr(kHasLogitsSoftCap) + { + p_compute(i_j_idx) = exp2(s[i_j_idx] - get_validated_m(m[i_idx])); + } + else + { + p_compute(i_j_idx) = exp2(scale_s * s[i_j_idx] - row_max); + } } #else 
p_compute(i_j_idx) = exp(s[i_j_idx] - get_validated_m(m[i_idx])); @@ -522,8 +560,16 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS } else { - auto row_max = scale_s * get_validated_m(m[i_idx]); - return exp2(scale_s * m_old[i_idx] - row_max); + if constexpr(kHasLogitsSoftCap) + { + + return exp2(m_old[i_idx] - get_validated_m(m[i_idx])); + } + else + { + auto row_max = scale_s * get_validated_m(m[i_idx]); + return exp2(scale_s * m_old[i_idx] - row_max); + } } }(); #else @@ -620,7 +666,14 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS } else { - lse_acc(i_idx) = m_[i_idx] * scale_s / C_LOG2E + log(l_[i_idx]); + if constexpr(kHasLogitsSoftCap) + { + lse_acc(i_idx) = m_[i_idx] / C_LOG2E + log(l_[i_idx]); + } + else + { + lse_acc(i_idx) = m_[i_idx] * scale_s / C_LOG2E + log(l_[i_idx]); + } } #else lse_acc(i_idx) = m_[i_idx] + log(l_[i_idx]); @@ -662,7 +715,9 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS typename VPageBlockNavigator, typename BiasDramBlockWindowTmp, typename LSEaccDramBlockWindowTmp, - typename PositionEncoding> + typename PositionEncoding, + typename AttentionVariantParams, + typename BlockIndices> CK_TILE_HOST_DEVICE auto operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*K0 tile const KDramBlockWindowLengths& k_dram_block_window_lengths, // N0*K0 tile @@ -676,6 +731,9 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS FmhaMask mask, PositionEncoding position_encoding, float scale_s, + const AttentionVariant& variant, + const AttentionVariantParams& variant_params, + const BlockIndices& block_indices, index_t kv_l2p_offset, // logical-to-physical offset of seqlen_k coordinate void* smem_ptr) const { @@ -699,6 +757,9 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS mask, position_encoding, scale_s, + variant, + variant_params, + block_indices, kv_l2p_offset, smem_ptr); } diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp index 9a5208c025..f35c00c268 100644 --- 
a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp @@ -20,6 +20,7 @@ template struct BlockFmhaPipelineProblem @@ -36,6 +37,7 @@ struct BlockFmhaPipelineProblem using OaccDataType = remove_cvref_t; using ODataType = remove_cvref_t; using BlockFmhaShape = remove_cvref_t; + using AttentionVariant = remove_cvref_t; using FmhaMask = remove_cvref_t; using Traits = remove_cvref_t; @@ -50,6 +52,7 @@ struct BlockFmhaPipelineProblem static constexpr bool kPadSeqLenK = Traits::kPadSeqLenK; static constexpr bool kPadHeadDimQ = Traits::kPadHeadDimQ; static constexpr bool kPadHeadDimV = Traits::kPadHeadDimV; + static constexpr bool kHasLogitsSoftCap = Traits::kHasLogitsSoftCap; static constexpr auto BiasEnum = Traits::BiasEnum; static constexpr bool kStoreLSE = Traits::kStoreLSE; static constexpr bool kHasDropout = Traits::kHasDropout; @@ -69,6 +72,7 @@ template struct BlockFmhaFwdSplitKVPipelineProblem @@ -84,6 +88,7 @@ struct BlockFmhaFwdSplitKVPipelineProblem using OaccDataType = remove_cvref_t; using ODataType = remove_cvref_t; using BlockFmhaShape = remove_cvref_t; + using AttentionVariant = remove_cvref_t; using FmhaMask = remove_cvref_t; using Traits = remove_cvref_t; @@ -98,6 +103,7 @@ struct BlockFmhaFwdSplitKVPipelineProblem static constexpr bool kPadSeqLenK = Traits::kPadSeqLenK; static constexpr bool kPadHeadDimQ = Traits::kPadHeadDimQ; static constexpr bool kPadHeadDimV = Traits::kPadHeadDimV; + static constexpr bool kHasLogitsSoftCap = Traits::kHasLogitsSoftCap; static constexpr auto BiasEnum = Traits::BiasEnum; static constexpr bool kStoreLSE = Traits::kStoreLSE; static constexpr bool kDoFp8StaticQuant = Traits::kDoFp8StaticQuant; diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp index 8a4a925b81..29f183c613 100644 --- 
a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp @@ -5,8 +5,8 @@ #include "ck_tile/core.hpp" #include "ck_tile/ops/fmha/block/block_attention_bias_enum.hpp" -#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp" #include "ck_tile/ops/fmha/block/block_dropout.hpp" +#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_default_policy.hpp" #include "ck_tile/ops/reduce/block/block_reduce.hpp" namespace ck_tile { @@ -28,6 +28,7 @@ struct BlockFmhaPipelineQRKSVS using PDataType = remove_cvref_t; using OaccDataType = remove_cvref_t; using ODataType = remove_cvref_t; + using AttentionVariant = remove_cvref_t; using FmhaMask = remove_cvref_t; using BlockFmhaShape = remove_cvref_t; @@ -47,14 +48,20 @@ struct BlockFmhaPipelineQRKSVS static_assert(kSubQKHeaddim <= 256, "hdim bigger than 256 is not suitable for this pipeline!"); - static constexpr bool kIsGroupMode = Problem::kIsGroupMode; - static constexpr bool kPadSeqLenQ = Problem::kPadSeqLenQ; - static constexpr bool kPadSeqLenK = Problem::kPadSeqLenK; - static constexpr bool kPadHeadDimQ = Problem::kPadHeadDimQ; - static constexpr bool kPadHeadDimV = Problem::kPadHeadDimV; - static constexpr auto BiasEnum = Problem::BiasEnum; - static constexpr bool kStoreLSE = Problem::kStoreLSE; - static constexpr bool kHasDropout = Problem::kHasDropout; + static constexpr bool kIsGroupMode = Problem::kIsGroupMode; + static constexpr bool kPadSeqLenQ = Problem::kPadSeqLenQ; + static constexpr bool kPadSeqLenK = Problem::kPadSeqLenK; + static constexpr bool kPadHeadDimQ = Problem::kPadHeadDimQ; + static constexpr bool kPadHeadDimV = Problem::kPadHeadDimV; + static constexpr bool kHasLogitsSoftCap = Problem::kHasLogitsSoftCap; + static constexpr auto BiasEnum = Problem::BiasEnum; + static constexpr bool kStoreLSE = Problem::kStoreLSE; + static constexpr bool kHasDropout = Problem::kHasDropout; + + 
static_assert((CK_TILE_FMHA_FWD_FAST_EXP2 && + (kHasLogitsSoftCap && Problem::BiasEnum == BlockAttentionBiasEnum::NO_BIAS || + !kHasLogitsSoftCap)) || + (!CK_TILE_FMHA_FWD_FAST_EXP2 && !kHasLogitsSoftCap)); // last dimension vector length used to create tensor view(and decide buffer_load vector length) // ... together with tensor distribution. tensor dist should able to overwrite this @@ -101,7 +108,7 @@ struct BlockFmhaPipelineQRKSVS else { return 1; - }; + } } }(); @@ -128,7 +135,9 @@ struct BlockFmhaPipelineQRKSVS typename SAccElementFunction, typename PComputeElementFunction, typename OAccElementFunction, - typename PositionEncoding> + typename PositionEncoding, + typename AttentionVariantParams, + typename BlockIndices> CK_TILE_HOST_DEVICE auto operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*K0 tile const QElementFunction& q_element_func, @@ -147,6 +156,9 @@ struct BlockFmhaPipelineQRKSVS FmhaMask mask, PositionEncoding position_encoding, float scale_s, + const AttentionVariant& variant, + const AttentionVariantParams& variant_params, + const BlockIndices& block_indices, void* smem_ptr, DropoutType& dropout) const { @@ -380,9 +392,28 @@ struct BlockFmhaPipelineQRKSVS else { s_acc = tile_elementwise_in(s_acc_element_func, s_acc); + if constexpr(kHasLogitsSoftCap) + { + auto apply_logits_transform = + [&variant, &variant_params, &block_indices](auto& x) { + x = variant.LogitsTransform(variant_params, + variant.QueryTransform(variant_params, x), + block_indices.batch_idx, + block_indices.qo_head_idx, + block_indices.kv_head_idx); + }; #if !CK_TILE_FMHA_FWD_FAST_EXP2 - tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc); + tile_elementwise_inout(apply_logits_transform, s_acc); +#else + tile_elementwise_inout(apply_logits_transform, s_acc); #endif + } + else + { +#if !CK_TILE_FMHA_FWD_FAST_EXP2 + tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc); +#endif + } } move_tile_window(bias_dram_window, {0, 
kN0}); if constexpr(kPadSeqLenK || FmhaMask::IsMasking) @@ -398,7 +429,12 @@ struct BlockFmhaPipelineQRKSVS s_acc, -numeric::infinity(), [&](auto tile_idx) { const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{}); const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{}); - return mask.IsOutOfBound(row, col); + return !variant.LogitsMask(variant_params, + block_indices.batch_idx, + row, + col, + block_indices.qo_head_idx, + block_indices.kv_head_idx); }); } } @@ -450,7 +486,14 @@ struct BlockFmhaPipelineQRKSVS } else { - p_compute(i_j_idx) = exp2(scale_s * s[i_j_idx] - row_max); + if constexpr(kHasLogitsSoftCap) + { + p_compute(i_j_idx) = exp2(s[i_j_idx] - get_validated_m(m[i_idx])); + } + else + { + p_compute(i_j_idx) = exp2(scale_s * s[i_j_idx] - row_max); + } } #else p_compute(i_j_idx) = exp(s[i_j_idx] - get_validated_m(m[i_idx])); @@ -475,8 +518,16 @@ struct BlockFmhaPipelineQRKSVS } else { - auto row_max = scale_s * get_validated_m(m[i_idx]); - return exp2(scale_s * m_old[i_idx] - row_max); + if constexpr(kHasLogitsSoftCap) + { + + return exp2(m_old[i_idx] - get_validated_m(m[i_idx])); + } + else + { + auto row_max = scale_s * get_validated_m(m[i_idx]); + return exp2(scale_s * m_old[i_idx] - row_max); + } } }(); #else @@ -574,7 +625,14 @@ struct BlockFmhaPipelineQRKSVS } else { - lse(i_idx) = m_[i_idx] * scale_s / C_LOG2E + log(l_[i_idx]); + if constexpr(kHasLogitsSoftCap) + { + lse(i_idx) = m_[i_idx] / C_LOG2E + log(l_[i_idx]); + } + else + { + lse(i_idx) = m_[i_idx] * scale_s / C_LOG2E + log(l_[i_idx]); + } } #else lse(i_idx) = m_[i_idx] + log(l_[i_idx]); @@ -614,7 +672,9 @@ struct BlockFmhaPipelineQRKSVS typename BiasDramBlockWindowTmp, typename RandValDramBlockWindowTmp, typename LSEDramBlockWindowTmp, - typename PositionEncoding> + typename PositionEncoding, + typename AttentionVariantParams, + typename BlockIndices> CK_TILE_HOST_DEVICE auto operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*K0 tile const 
KDramBlockWindowTmp& k_dram_block_window_tmp, // N0*K0 tile @@ -625,6 +685,9 @@ struct BlockFmhaPipelineQRKSVS FmhaMask mask, PositionEncoding position_encoding, float scale_s, + const AttentionVariant& variant, + const AttentionVariantParams& variant_params, + const BlockIndices& block_indices, void* smem_ptr, DropoutType& dropout) const { @@ -645,6 +708,9 @@ struct BlockFmhaPipelineQRKSVS mask, position_encoding, scale_s, + variant, + variant_params, + block_indices, smem_ptr, dropout); } diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp index 67354fc72d..7af3902dc5 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp @@ -29,6 +29,7 @@ struct BlockFmhaPipelineQRKSVSAsync using PDataType = remove_cvref_t; using OaccDataType = remove_cvref_t; using ODataType = remove_cvref_t; + using AttentionVariant = remove_cvref_t; using FmhaMask = remove_cvref_t; using BlockFmhaShape = remove_cvref_t; @@ -53,13 +54,19 @@ struct BlockFmhaPipelineQRKSVSAsync // only need special care about seq_k padding (oob need set -INF of p instead of zero) static_assert(Problem::kPadSeqLenQ == true && Problem::kPadHeadDimQ == true && Problem::kPadHeadDimV == true); - static constexpr bool kPadSeqLenQ = true; - static constexpr bool kPadSeqLenK = Problem::kPadSeqLenK; - static constexpr bool kPadHeadDimQ = true; // support multiple of vector(like 8x) - static constexpr bool kPadHeadDimV = true; // support multiple of vector(like 8x) - static constexpr auto BiasEnum = Problem::BiasEnum; - static constexpr bool kStoreLSE = Problem::kStoreLSE; - static constexpr bool kHasDropout = Problem::kHasDropout; + static constexpr bool kPadSeqLenQ = true; + static constexpr bool kPadSeqLenK = Problem::kPadSeqLenK; + static constexpr bool kPadHeadDimQ = true; // support multiple of 
vector(like 8x) + static constexpr bool kPadHeadDimV = true; // support multiple of vector(like 8x) + static constexpr bool kHasLogitsSoftCap = Problem::kHasLogitsSoftCap; + static constexpr auto BiasEnum = Problem::BiasEnum; + static constexpr bool kStoreLSE = Problem::kStoreLSE; + static constexpr bool kHasDropout = Problem::kHasDropout; + + static_assert((CK_TILE_FMHA_FWD_FAST_EXP2 && + (kHasLogitsSoftCap && Problem::BiasEnum == BlockAttentionBiasEnum::NO_BIAS || + !kHasLogitsSoftCap)) || + (!CK_TILE_FMHA_FWD_FAST_EXP2 && !kHasLogitsSoftCap)); // last dimension vector length used to create tensor view(and decide buffer_load vector length) // ... together with tensor distribution. tensor dist should able to overwrite this @@ -153,7 +160,9 @@ struct BlockFmhaPipelineQRKSVSAsync typename SAccElementFunction, typename PComputeElementFunction, typename OAccElementFunction, - typename PositionEncoding> + typename PositionEncoding, + typename AttentionVariantParams, + typename BlockIndices> CK_TILE_HOST_DEVICE auto operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*K0 tile const QElementFunction& q_element_func, @@ -172,6 +181,9 @@ struct BlockFmhaPipelineQRKSVSAsync FmhaMask mask, PositionEncoding position_encoding, float scale_s, + const AttentionVariant& variant, + const AttentionVariantParams& variant_params, + const BlockIndices& block_indices, void* smem_ptr, DropoutType& dropout) const { @@ -435,9 +447,34 @@ struct BlockFmhaPipelineQRKSVSAsync else { s_acc = tile_elementwise_in(s_acc_element_func, s_acc); + if constexpr(kHasLogitsSoftCap) + { + auto apply_logits_transform = + [&variant, &variant_params, &block_indices](auto& x) { + x = variant.LogitsTransform(variant_params, + variant.QueryTransform(variant_params, x), + block_indices.batch_idx, + block_indices.qo_head_idx, + block_indices.kv_head_idx); + }; #if !CK_TILE_FMHA_FWD_FAST_EXP2 - tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc); + for(index_t i = 0; i < 
s_acc.thread_buf_.size(); ++i) + { + apply_logits_transform(s_acc.thread_buf_[i]); + } +#else + for(index_t i = 0; i < s_acc.thread_buf_.size(); ++i) + { + apply_logits_transform(s_acc.thread_buf_[i]); + } #endif + } + else + { +#if !CK_TILE_FMHA_FWD_FAST_EXP2 + tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc); +#endif + } } move_tile_window(bias_dram_window, {0, kN0}); if constexpr(kPadSeqLenK || FmhaMask::IsMasking) @@ -454,7 +491,12 @@ struct BlockFmhaPipelineQRKSVSAsync s_acc, -numeric::infinity(), [&](auto tile_idx) { const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{}); const auto col = k_origin.at(number<0>{}) + tile_idx.at(number<1>{}); - return mask.IsOutOfBound(row, col); + return !variant.LogitsMask(variant_params, + block_indices.batch_idx, + row, + col, + block_indices.qo_head_idx, + block_indices.kv_head_idx); }); } } @@ -543,7 +585,14 @@ struct BlockFmhaPipelineQRKSVSAsync } else { - p_compute(i_j_idx) = exp2(scale_s * s[i_j_idx] - row_max); + if constexpr(kHasLogitsSoftCap) + { + p_compute(i_j_idx) = exp2(s[i_j_idx] - get_validated_m(m[i_idx])); + } + else + { + p_compute(i_j_idx) = exp2(scale_s * s[i_j_idx] - row_max); + } } #else p_compute(i_j_idx) = exp(s[i_j_idx] - get_validated_m(m[i_idx])); @@ -568,8 +617,15 @@ struct BlockFmhaPipelineQRKSVSAsync } else { - auto row_max = scale_s * get_validated_m(m[i_idx]); - return exp2(scale_s * m_old[i_idx] - row_max); + if constexpr(kHasLogitsSoftCap) + { + return exp2(m_old[i_idx] - get_validated_m(m[i_idx])); + } + else + { + auto row_max = scale_s * get_validated_m(m[i_idx]); + return exp2(scale_s * m_old[i_idx] - row_max); + } } }(); #else @@ -695,7 +751,14 @@ struct BlockFmhaPipelineQRKSVSAsync } else { - lse(i_idx) = m_[i_idx] * scale_s * R_LOG2E + log(l_[i_idx]); + if constexpr(kHasLogitsSoftCap) + { + lse(i_idx) = m_[i_idx] * R_LOG2E + log(l_[i_idx]); + } + else + { + lse(i_idx) = m_[i_idx] * scale_s * R_LOG2E + log(l_[i_idx]); + } } #else lse(i_idx) = 
m_[i_idx] + log(l_[i_idx]); @@ -735,7 +798,9 @@ struct BlockFmhaPipelineQRKSVSAsync typename BiasDramBlockWindowTmp, typename RandValDramBlockWindowTmp, typename LSEDramBlockWindowTmp, - typename PositionEncoding> + typename PositionEncoding, + typename AttentionVariantParams, + typename BlockIndices> CK_TILE_HOST_DEVICE auto operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*K0 tile const KDramBlockWindowTmp& k_dram_block_window_tmp, // N0*K0 tile @@ -746,6 +811,9 @@ struct BlockFmhaPipelineQRKSVSAsync FmhaMask mask, PositionEncoding position_encoding, float scale_s, + const AttentionVariant& variant, + const AttentionVariantParams& variant_params, + const BlockIndices& block_indices, void* smem_ptr, DropoutType& dropout) const { @@ -766,6 +834,9 @@ struct BlockFmhaPipelineQRKSVSAsync mask, position_encoding, scale_s, + variant, + variant_params, + block_indices, smem_ptr, dropout); } diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp index 7be6a347f5..4efcd871dc 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp @@ -7,6 +7,7 @@ #include "ck_tile/ops/fmha/block/block_attention_bias_enum.hpp" #include "ck_tile/ops/fmha/block/block_dropout.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs_default_policy.hpp" +#include "ck_tile/ops/reduce/block/block_reduce.hpp" namespace ck_tile { @@ -27,6 +28,7 @@ struct BlockFmhaPipelineQSKSVS using PDataType = remove_cvref_t; using OaccDataType = remove_cvref_t; using ODataType = remove_cvref_t; + using AttentionVariant = remove_cvref_t; using FmhaMask = remove_cvref_t; using BlockFmhaShape = remove_cvref_t; @@ -44,14 +46,21 @@ struct BlockFmhaPipelineQSKSVS static constexpr index_t kQKHeaddim = BlockFmhaShape::kQKHeaddim; static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim; - 
static constexpr bool kIsGroupMode = Problem::kIsGroupMode; - static constexpr bool kPadSeqLenQ = Problem::kPadSeqLenQ; - static constexpr bool kPadSeqLenK = Problem::kPadSeqLenK; - static constexpr bool kPadHeadDimQ = Problem::kPadHeadDimQ; - static constexpr bool kPadHeadDimV = Problem::kPadHeadDimV; - static constexpr auto BiasEnum = Problem::BiasEnum; - static constexpr bool kStoreLSE = Problem::kStoreLSE; - static constexpr bool kHasDropout = Problem::kHasDropout; + static constexpr bool kIsGroupMode = Problem::kIsGroupMode; + static constexpr bool kPadSeqLenQ = Problem::kPadSeqLenQ; + static constexpr bool kPadSeqLenK = Problem::kPadSeqLenK; + static constexpr bool kPadHeadDimQ = Problem::kPadHeadDimQ; + static constexpr bool kPadHeadDimV = Problem::kPadHeadDimV; + static constexpr bool kHasLogitsSoftCap = Problem::kHasLogitsSoftCap; + static constexpr auto BiasEnum = Problem::BiasEnum; + static constexpr bool kStoreLSE = Problem::kStoreLSE; + static constexpr bool kHasDropout = Problem::kHasDropout; + + static_assert((CK_TILE_FMHA_FWD_FAST_EXP2 && + (kHasLogitsSoftCap && Problem::BiasEnum == BlockAttentionBiasEnum::NO_BIAS || + !kHasLogitsSoftCap)) || + (!CK_TILE_FMHA_FWD_FAST_EXP2 && !kHasLogitsSoftCap)); + // last dimension vector length used to create tensor view(and decide buffer_load vector length) // ... together with tensor distribution. 
tensor dist should able to overwrite this static constexpr index_t kAlignmentQ = @@ -95,7 +104,9 @@ struct BlockFmhaPipelineQSKSVS return 1; } else + { return 1; + } } }(); @@ -122,7 +133,9 @@ struct BlockFmhaPipelineQSKSVS typename SAccElementFunction, typename PComputeElementFunction, typename OAccElementFunction, - typename PositionEncoding> + typename PositionEncoding, + typename AttentionVariantParams, + typename BlockIndices> CK_TILE_HOST_DEVICE auto operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*K0 tile const QElementFunction& q_element_func, @@ -141,6 +154,9 @@ struct BlockFmhaPipelineQSKSVS FmhaMask mask, PositionEncoding position_encoding, float scale_s, + const AttentionVariant& variant, + const AttentionVariantParams& variant_params, + const BlockIndices& block_indices, void* smem_ptr, DropoutType& /* unused_dropout */) const { @@ -380,9 +396,28 @@ struct BlockFmhaPipelineQSKSVS else { s_acc = tile_elementwise_in(s_acc_element_func, s_acc); + if constexpr(kHasLogitsSoftCap) + { + auto apply_logits_transform = + [&variant, &variant_params, &block_indices](auto& x) { + x = variant.LogitsTransform(variant_params, + variant.QueryTransform(variant_params, x), + block_indices.batch_idx, + block_indices.qo_head_idx, + block_indices.kv_head_idx); + }; #if !CK_TILE_FMHA_FWD_FAST_EXP2 - tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc); + tile_elementwise_inout(apply_logits_transform, s_acc); +#else + tile_elementwise_inout(apply_logits_transform, s_acc); #endif + } + else + { +#if !CK_TILE_FMHA_FWD_FAST_EXP2 + tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc); +#endif + } } move_tile_window(bias_dram_window, {0, kN0}); if constexpr(kPadSeqLenK || FmhaMask::IsMasking) @@ -398,7 +433,12 @@ struct BlockFmhaPipelineQSKSVS s_acc, -numeric::infinity(), [&](auto tile_idx) { const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{}); const auto col = k_origin.at(number<0>{}) + 
tile_idx.at(number<1>{}); - return mask.IsOutOfBound(row, col); + return !variant.LogitsMask(variant_params, + block_indices.batch_idx, + row, + col, + block_indices.qo_head_idx, + block_indices.kv_head_idx); }); } } @@ -450,7 +490,14 @@ struct BlockFmhaPipelineQSKSVS } else { - p_compute(i_j_idx) = exp2(scale_s * s[i_j_idx] - row_max); + if constexpr(kHasLogitsSoftCap) + { + p_compute(i_j_idx) = exp2(s[i_j_idx] - get_validated_m(m[i_idx])); + } + else + { + p_compute(i_j_idx) = exp2(scale_s * s[i_j_idx] - row_max); + } } #else p_compute(i_j_idx) = exp(s[i_j_idx] - get_validated_m(m[i_idx])); @@ -481,8 +528,16 @@ struct BlockFmhaPipelineQSKSVS } else { - auto row_max = scale_s * get_validated_m(m[i_idx]); - return exp2(scale_s * m_old[i_idx] - row_max); + if constexpr(kHasLogitsSoftCap) + { + + return exp2(m_old[i_idx] - get_validated_m(m[i_idx])); + } + else + { + auto row_max = scale_s * get_validated_m(m[i_idx]); + return exp2(scale_s * m_old[i_idx] - row_max); + } } }(); #else @@ -571,7 +626,14 @@ struct BlockFmhaPipelineQSKSVS } else { - lse(i_idx) = m_[i_idx] * scale_s / C_LOG2E + log(l_[i_idx]); + if constexpr(kHasLogitsSoftCap) + { + lse(i_idx) = m_[i_idx] / C_LOG2E + log(l_[i_idx]); + } + else + { + lse(i_idx) = m_[i_idx] * scale_s / C_LOG2E + log(l_[i_idx]); + } } #else lse(i_idx) = m_[i_idx] + log(l_[i_idx]); @@ -611,7 +673,9 @@ struct BlockFmhaPipelineQSKSVS typename BiasDramBlockWindowTmp, typename RandValDramBlockWindowTmp, typename LSEDramBlockWindowTmp, - typename PositionEncoding> + typename PositionEncoding, + typename AttentionVariantParams, + typename BlockIndices> CK_TILE_HOST_DEVICE auto operator()(const QDramBlockWindowTmp& q_dram_block_window_tmp, // M0*K0 tile const KDramBlockWindowTmp& k_dram_block_window_tmp, // N0*K0 tile @@ -622,6 +686,9 @@ struct BlockFmhaPipelineQSKSVS FmhaMask mask, PositionEncoding position_encoding, float scale_s, + const AttentionVariant& variant, + const AttentionVariantParams& variant_params, + const 
BlockIndices& block_indices, void* smem_ptr, DropoutType& dropout) const { @@ -642,6 +709,9 @@ struct BlockFmhaPipelineQSKSVS mask, position_encoding, scale_s, + variant, + variant_params, + block_indices, smem_ptr, dropout); } diff --git a/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp b/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp index 8d2d848558..4530b58d85 100644 --- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp +++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp @@ -13,6 +13,7 @@ template 1 or fwd training is running */ @@ -51,6 +54,7 @@ struct TileFmhaFwdSplitKVTraits static constexpr bool kPadSeqLenK = kPadSeqLenK_; static constexpr bool kPadHeadDimQ = kPadHeadDimQ_; static constexpr bool kPadHeadDimV = kPadHeadDimV_; + static constexpr bool kHasLogitsSoftCap = kHasLogitsSoftCap_; static constexpr auto BiasEnum = BiasEnum_; static constexpr bool kHasBiasGrad = kHasBiasGrad_; static constexpr bool kStoreLSE = kStoreLSE_; From c53b7bd22e75c69beddb6ffefc22b5f95354ffba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Tue, 13 May 2025 10:14:30 +0200 Subject: [PATCH 117/443] Switch to v2 pipeline for grouped conv bwd data (#2181) * Change to old pipeline for grouped conv bwd data * fix * fix * fix * fix * fix * fix * Fix --- ...ped_conv_fwd_multiple_abd_xdl_cshuffle.hpp | 29 +- ...ed_contraction_multiple_d_xdl_cshuffle.hpp | 30 +- .../device_batched_gemm_e_permute_xdl.hpp | 28 +- .../impl/device_batched_gemm_multi_d_xdl.hpp | 30 +- ...ce_contraction_multiple_d_xdl_cshuffle.hpp | 30 +- .../device_gemm_multiple_d_xdl_cshuffle.hpp | 30 +- ...ed_contraction_multiple_d_xdl_cshuffle.hpp | 5 +- ...nv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp | 1052 ++--------------- ...ped_conv_fwd_multiple_abd_xdl_cshuffle.hpp | 28 +- ...d_multiple_d_xdl_large_tensor_cshuffle.hpp | 21 +- .../device/impl/device_grouped_gemm_xdl.hpp | 3 +- .../gridwise_gemm_multiple_d_xdl_cshuffle.hpp | 81 +- 12 files changed, 256 insertions(+), 
1111 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp index 00518b369f..72c011bfb2 100644 --- a/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -153,7 +153,7 @@ __device__ void device_grouped_conv_fwd_multiple_abd_xdl_cshuffle( const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); - GridwiseGemm::template Run( + GridwiseGemm::template Run( p_as_grid + a_batch_offset, p_bs_grid + b_batch_offset, p_ds_grid_grp, @@ -439,7 +439,7 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle using GemmADataType = ck::conditional_t, ADataType>; using GemmBDataType = ck::conditional_t, BDataType>; -#define GridwiseGemmTemplateParameters \ +#define GridwiseGemmMultiABDTemplateParameters \ GemmADataType, GemmBDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, \ EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, \ InMemoryDataOperationEnum::Set, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, \ @@ -454,11 +454,26 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \ CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \ CDEBlockTransferScalarPerVector_NPerBlock, LoopSched + +#define GridwiseGemmTemplateParameters \ + GemmADataType, GemmBDataType, ComputeDataType, AccDataType, 
CShuffleDataType, DsDataType, \ + EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, \ + NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, \ + NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, \ + ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, \ + ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, \ + ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, \ + BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, \ + BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, \ + BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, \ + BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \ + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \ + CDEBlockTransferScalarPerVector_NPerBlock, LoopSched // Use appropriate gridwise gemm - using GridwiseGemm = - ck::conditional_t, - GridwiseGemmMultipleD_xdl_cshuffle>; + using GridwiseGemm = ck::conditional_t< + isMultiA || isMultiB, + GridwiseGemmMultipleABD_xdl_cshuffle, + GridwiseGemmMultipleD_xdl_cshuffle>; // If ADataTypes or BDataTypes is tuple, user has to pass ck::Array with pointers. using APointers = ck::conditional_t&, const void*>; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp index d53fbca4ea..fc1a2b995a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. 
All rights reserved. #pragma once @@ -80,19 +80,20 @@ __global__ void static_for<0, NumDTensor, 1>{}( [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); - GridwiseGemm::template Run(p_a_grid + a_batch_offset, - p_b_grid + b_batch_offset, - p_ds_grid_grp, - p_e_grid + e_batch_offset, - p_shared, - a_element_op, - b_element_op, - cde_element_op, - a_grid_desc_ak0_m_ak1, - b_grid_desc_bk0_n_bk1, - ds_grid_desc_mblock_mperblock_nblock_nperblock, - e_grid_desc_mblock_mperblock_nblock_nperblock, - block_2_etile_map); + GridwiseGemm::template Run( + p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_ds_grid_grp, + p_e_grid + e_batch_offset, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_etile_map); #else ignore = p_a_grid; ignore = p_b_grid; @@ -556,7 +557,6 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, - InMemoryDataOperationEnum::Set, NumGemmKPrefetchStage, BlockSize, MPerBlock, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp index 25a9d7f96d..0cd1d84a43 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp @@ -88,19 +88,20 @@ __global__ void __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; - GridwiseGemm::template Run(p_a_grid + a_batch_offset, - p_b_grid + b_batch_offset, - ck::Tuple<>{}, - p_e_grid + e_batch_offset, - p_shared, - a_element_op, - b_element_op, - cde_element_op, - a_grid_desc_ak0_m_ak1, - b_grid_desc_bk0_n_bk1, - ck::Tuple<>{}, - e_grid_desc_mblock_mperblock_nblock_nperblock, - 
block_2_etile_map); + GridwiseGemm::template Run( + p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + ck::Tuple<>{}, + p_e_grid + e_batch_offset, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + ck::Tuple<>{}, + e_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_etile_map); #else ignore = p_a_grid; ignore = p_b_grid; @@ -344,7 +345,6 @@ struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp index 630f143260..12085edaae 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -107,19 +107,20 @@ __global__ void static_for<0, NumDTensor, 1>{}( [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); - GridwiseGemm::template Run(p_a_grid + a_batch_offset, - p_b_grid + b_batch_offset, - p_ds_grid_grp, - p_e_grid + e_batch_offset, - p_shared, - a_element_op, - b_element_op, - cde_element_op, - a_grid_desc_k0_m_k1, - b_grid_desc_k0_n_k1, - ds_grid_desc_mblock_mperblock_nblock_nperblock, - e_grid_desc_mblock_mperblock_nblock_nperblock_, - block_2_etile_map); + GridwiseGemm::template Run( + p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_ds_grid_grp, + p_e_grid + e_batch_offset, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_n_k1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock_, + block_2_etile_map); #else ignore = p_a_grid; ignore = p_b_grid; @@ -336,7 +337,6 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD(p_a_grid, - p_b_grid, - p_ds_grid, - p_e_grid, - p_shared, - a_element_op, - b_element_op, - cde_element_op, - a_grid_desc_ak0_m_ak1, - b_grid_desc_bk0_n_bk1, - ds_grid_desc_mblock_mperblock_nblock_nperblock, - e_grid_desc_mblock_mperblock_nblock_nperblock, - block_2_etile_map); + GridwiseGemm::template Run( + p_a_grid, + p_b_grid, + p_ds_grid, + p_e_grid, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_etile_map); #else ignore = p_a_grid; ignore = p_b_grid; @@ -324,7 +325,6 @@ struct DeviceContractionMultipleD_Xdl_CShuffle AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, - InMemoryDataOperationEnum::Set, NumGemmKPrefetchStage, BlockSize, MPerBlock, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp 
b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp index 3fae3a3765..6c4195e75d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -57,19 +57,20 @@ __global__ void #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; - GridwiseGemm::template Run(p_a_grid, - p_b_grid, - p_ds_grid, - p_e_grid, - p_shared, - a_element_op, - b_element_op, - cde_element_op, - a_grid_desc_ak0_m_ak1, - b_grid_desc_bk0_n_bk1, - ds_grid_desc_mblock_mperblock_nblock_nperblock, - e_grid_desc_mblock_mperblock_nblock_nperblock, - block_2_etile_map); + GridwiseGemm::template Run( + p_a_grid, + p_b_grid, + p_ds_grid, + p_e_grid, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_etile_map); #else ignore = p_a_grid; ignore = p_b_grid; @@ -257,7 +258,6 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD( + GridwiseGemm::template Run( contraction_arg_ptr[group_id].p_a_grid_, contraction_arg_ptr[group_id].p_b_grid_, contraction_arg_ptr[group_id].p_ds_grid_, @@ -368,7 +368,6 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, - InMemoryDataOperationEnum::Set, NumGemmKPrefetchStage, BlockSize, MPerBlock, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp 
b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp index 41f596d160..f18ce40fc5 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp @@ -15,7 +15,6 @@ #include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp" #include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp" #include "ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/operator_transform/transform_conv_ngchw_to_nhwgc.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" @@ -71,7 +70,8 @@ template + bool HasMainKBlockLoop, + InMemoryDataOperationEnum OutElementOp> __global__ void #if CK_USE_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) @@ -92,12 +92,14 @@ __global__ void e_grid_desc_mblock_mperblock_nblock_nperblock_, const Block2ETileMap block_2_ctile_map, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, - const ComputePtrOffsetOfN compute_ptr_offset_of_n) + const ComputePtrOffsetOfN compute_ptr_offset_of_n, + const index_t KBatch) { #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) // offset base pointer for each work-group - const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z); const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y); + const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z / KBatch); + const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.z - n_idx * KBatch); const long_index_t a_batch_offset = amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); @@ -123,19 +125,22 
@@ __global__ void static_for<0, NumDTensor, 1>{}( [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); - GridwiseGemm::template Run(p_a_grid + a_batch_offset + a_n_offset, - p_b_grid + b_batch_offset, - p_ds_grid_grp, - p_e_grid + e_batch_offset + e_n_offset, - p_shared, - a_element_op, - b_element_op, - cde_element_op, - a_grid_desc_ak0_m_ak1, - b_grid_desc_bk0_n_bk1, - ds_grid_desc_mblock_mperblock_nblock_nperblock, - e_grid_desc_mblock_mperblock_nblock_nperblock_, - block_2_ctile_map); + GridwiseGemm::template Run( + p_a_grid + a_batch_offset + a_n_offset, + p_b_grid + b_batch_offset, + p_ds_grid_grp, + p_e_grid + e_batch_offset + e_n_offset, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock_, + block_2_ctile_map, + KBatch, + k_idx); #else ignore = p_a_grid; ignore = p_b_grid; @@ -154,151 +159,6 @@ __global__ void #endif } -template -__global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy) -#endif - // __attribute__((amdgpu_waves_per_eu(1, 1))) - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3( - typename GridwiseGemm::Argument karg, - const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1, - const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1, - const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock - c_grid_desc_mblock_mperblock_nblock_nperblock, - const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, - const ComputePtrOffsetOfN compute_ptr_offset_of_n, - const index_t num_k_per_block) -{ -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) - // offset base pointer for each work-group - const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z); - const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / karg.KBatch); - const index_t k_idx = - __builtin_amdgcn_readfirstlane((blockIdx.y - n_idx * karg.KBatch) * num_k_per_block); - - 
const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); - const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); - const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); - - const long_index_t a_n_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx)); - const long_index_t e_n_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx)); - - __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; - - GridwiseGemm::template Run(karg.p_a_grid + a_batch_offset + a_n_offset, - karg.p_b_grid + b_batch_offset, - karg.p_c_grid + e_batch_offset + e_n_offset, - p_shared, - karg, - a_grid_desc_ak0_m_ak1, - b_grid_desc_bk0_n_bk1, - c_grid_desc_mblock_mperblock_nblock_nperblock, - k_idx); -#else - ignore = karg; - ignore = a_grid_desc_ak0_m_ak1; - ignore = b_grid_desc_bk0_n_bk1; - ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; - ignore = compute_ptr_offset_of_batch; - ignore = compute_ptr_offset_of_n; - ignore = num_k_per_block; -#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) -} - -template -__global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy) -#endif - // __attribute__((amdgpu_waves_per_eu(1, 1))) - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3_2lds( - typename GridwiseGemm::Argument karg, - const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1, - const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1, - const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock - c_grid_desc_mblock_mperblock_nblock_nperblock, - const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, - const ComputePtrOffsetOfN compute_ptr_offset_of_n, - const index_t num_k_per_block) -{ -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__)) - const 
index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z); - const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / karg.KBatch); - const index_t k_idx = - __builtin_amdgcn_readfirstlane((blockIdx.y - n_idx * karg.KBatch) * num_k_per_block); - - const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); - const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); - const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); - - const long_index_t a_n_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx)); - const long_index_t e_n_offset = - amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx)); - - // Pass two lds pointer is the key to tell compiler that ds_read/write - // operate on different lds chunk at same time without order dependecy - __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()]; - __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()]; - - GridwiseGemm::template Run_2Lds(karg.p_a_grid + a_batch_offset + a_n_offset, - karg.p_b_grid + b_batch_offset, - karg.p_c_grid + e_batch_offset + e_n_offset, - p_shared_0, - p_shared_1, - karg, - a_grid_desc_ak0_m_ak1, - b_grid_desc_bk0_n_bk1, - c_grid_desc_mblock_mperblock_nblock_nperblock, - k_idx); -#else - ignore = karg; - ignore = a_grid_desc_ak0_m_ak1; - ignore = b_grid_desc_bk0_n_bk1; - ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; - ignore = compute_ptr_offset_of_batch; - ignore = compute_ptr_offset_of_n; - ignore = num_k_per_block; -#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) -} - } // namespace // Conv backward data multiple D: @@ -358,9 +218,7 @@ template + index_t MaxTransposeTransferOutScalarPerVector = 1> struct 
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 : public DeviceGroupedConvBwdDataMultipleD 0; static constexpr GemmSpecialization GemmSpec = GemmSpecialization::MNKPadding; static constexpr bool IsSplitKSupported = (CDEBlockTransferScalarPerVector_NPerBlock % 2 == 0 || sizeof(EDataType) % 4 == 0) && @@ -473,59 +330,25 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 // GridwiseGemm #define GridwiseGemmMultiDTemplateParams \ ABDataType, ABDataType, AComputeType, AccDataType, CShuffleDataType, DsDataType, EDataType, \ - AElementwiseOp, BElementwiseOp, CDEElementwiseOp, InMemoryDataOperationEnum::Set, \ - NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, \ - NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, \ - ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, \ - ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, \ - ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, \ - BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, \ - BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, \ - BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, \ - BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \ + AElementwiseOp, BElementwiseOp, CDEElementwiseOp, NumGemmKPrefetchStage, BlockSize, \ + MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, \ + ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, \ + ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, \ + ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, \ + ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, \ + BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, \ + BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, \ + BBlockTransferDstScalarPerVector_BK1, false, 
BBlockLdsExtraN, \ + CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \ CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \ CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVersion::v1, BComputeType - -#define GridwiseGemmTemplateParams \ - tensor_layout::gemm::RowMajor, tensor_layout::gemm::RowMajor, tensor_layout::gemm::RowMajor, \ - ADataType, BDataType, AccDataType, CShuffleDataType, EDataType, AElementwiseOp, \ - BElementwiseOp, CDEElementwiseOp, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, \ - AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, \ - ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, \ - ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, \ - ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, \ - ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, \ - BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, \ - BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, \ - BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, \ - CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \ - CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \ - CDEBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, \ - AComputeType, BComputeType - - using GridwiseGemm = - std::conditional_t, - GridwiseGemm_xdl_cshuffle_v3>; + using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle; template static auto MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const EGridDesc_M_N e_grid_desc_m_n) { - if constexpr(isMultiD) - { - return GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - e_grid_desc_m_n); - } - else - { - const index_t M = e_grid_desc_m_n.GetLength(I0); - const index_t N = e_grid_desc_m_n.GetLength(I1); - return GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - e_grid_desc_m_n, - 
GridwiseGemm::CalculateMBlock(M), - GridwiseGemm::CalculateNBlock(N)); - } + return GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(e_grid_desc_m_n); } template @@ -850,46 +673,34 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 const auto b_grid_desc_n_k = transform_k0_m_k1_to_m_k(b_grid_desc_bk0_n_bk1); - if constexpr(isMultiD) - { - a_grid_desc_m_k_container_.push_back(a_grid_desc_m_k); - b_grid_desc_n_k_container_.push_back(b_grid_desc_n_k); - ds_grid_desc_m_n_container_.push_back(ds_grid_desc_m_n); - e_grid_desc_m_n_container_.push_back(e_grid_desc_m_n); - } + a_grid_desc_m_k_container_.push_back(a_grid_desc_m_k); + b_grid_desc_n_k_container_.push_back(b_grid_desc_n_k); + ds_grid_desc_m_n_container_.push_back(ds_grid_desc_m_n); + e_grid_desc_m_n_container_.push_back(e_grid_desc_m_n); // desc for blockwise copy a_grid_desc_ak0_m_ak1_container_.push_back(a_grid_desc_ak0_m_ak1); b_grid_desc_bk0_n_bk1_container_.push_back(b_grid_desc_bk0_n_bk1); - if constexpr(isMultiD) + // block-to-e-tile-map + auto block_2_etile_map = + GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n); + + block_2_etile_map_container_.push_back(block_2_etile_map); + + if(GridwiseGemm::CheckValidity(a_grid_desc_m_k, + b_grid_desc_n_k, + ds_grid_desc_m_n, + e_grid_desc_m_n, + block_2_etile_map, + k_batch_)) { - // block-to-e-tile-map - auto block_2_etile_map = - GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n); + ds_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back( - block_2_etile_map_container_.push_back(block_2_etile_map); + GridwiseGemm:: + MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n)); - if(GridwiseGemm::CheckValidity(a_grid_desc_m_k, - b_grid_desc_n_k, - ds_grid_desc_m_n, - e_grid_desc_m_n, - block_2_etile_map)) - { - ds_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back( - - GridwiseGemm:: - MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - ds_grid_desc_m_n)); - - 
e_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back( - MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - e_grid_desc_m_n)); - } - } - else - { - // there is no need to check since M, N, K are padded e_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back( MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( e_grid_desc_m_n)); @@ -1083,12 +894,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 { using Argument = DeviceOp::Argument; + template float RunMultiDGemm(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { float ave_time = 0; const index_t gdy = arg.num_group_; - const index_t gdz = arg.num_workgroups_per_Conv_N_; + const index_t gdz = arg.num_workgroups_per_Conv_N_ * arg.k_batch_; const ADataType* p_a_grid = arg.p_a_grid_; const BDataType* p_b_grid = arg.p_b_grid_; @@ -1117,7 +929,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 arg.b_grid_desc_n_k_container_[i], arg.ds_grid_desc_m_n_container_[i], arg.e_grid_desc_m_n_container_[i], - arg.block_2_etile_map_container_[i])) + arg.block_2_etile_map_container_[i], + arg.k_batch_)) { throw std::runtime_error("wrong! 
device_op has invalid setting"); } @@ -1145,7 +958,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 Block2ETileMap, ComputePtrOffsetOfStridedBatch, ComputePtrOffsetOfStridedBatch, - has_main_loop>; + has_main_loop, + ElementOp>; return launch_and_time_kernel( stream_config, @@ -1166,10 +980,11 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 arg.e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i], arg.block_2_etile_map_container_[i], arg.compute_ptr_offset_of_batch_, - arg.compute_ptr_offset_of_n_); + arg.compute_ptr_offset_of_n_, + arg.k_batch_); }; - if(GridwiseGemm::CalculateHasMainKBlockLoop(GemmK)) + if(GridwiseGemm::CalculateHasMainKBlockLoop(GemmK, arg.k_batch_)) { ave_time += launch_kernel(integral_constant{}); } @@ -1182,678 +997,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 return ave_time; } - float RunGemmV3(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) - { - float ave_time = 0; - - const ADataType* p_a_grid = arg.p_a_grid_; - const BDataType* p_b_grid = arg.p_b_grid_; - EDataType* p_e_grid = arg.p_e_grid_; - - if constexpr(is_NGCHW_NGKHW() || - is_NGCDHW_NGKDHW()) - { - p_a_grid = type_convert(arg.p_workspace_); - p_e_grid = - type_convert(arg.p_workspace_) + - (arg.GetWorkspaceATensorSizeBytes() + arg.GetWorkspaceBTensorSizeBytes()) / - sizeof(EDataType); - } - - if constexpr(is_NGCHW_GKCYX_NGKHW() || - is_NGCDHW_GKCZYX_NGKDHW()) - { - p_b_grid = type_convert(arg.p_workspace_) + - arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType); - } - - constexpr index_t minimum_occupancy = - BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave ? 
1 : 2; - - for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++) - { - const index_t GemmM = arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I1); - const index_t GemmN = arg.b_grid_desc_bk0_n_bk1_container_[i].GetLength(I1); - const index_t GemmK = arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I0) * - arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I2); - - const auto num_k_per_block = - arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(Number<0>{}) / arg.k_batch_; - - // gdy is for the kbatch and num_workgrups_per_Conv_N - index_t gdx, gdy, gdz; - std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize( - GemmM, GemmN, arg.k_batch_ * arg.num_workgroups_per_Conv_N_, arg.num_group_); - - index_t k_grain = arg.k_batch_ * KPerBlock; - index_t K_split = (GemmK + k_grain - 1) / k_grain * KPerBlock; - const bool has_main_k_block_loop = - GridwiseGemm::CalculateHasMainKBlockLoop(K_split); - - typename GridwiseGemm::Argument gemm_arg{ - p_a_grid, p_b_grid, p_e_grid, GemmM, GemmN, GemmK, I0, I0, I0, arg.k_batch_}; - - const auto Run = [&](const auto& kernel) { - if(stream_config.flush_cache) - { - typename GridwiseGemm::Argument gemm_arg_ = gemm_arg; - ck::utility::RotatingMemWrapper - rotating_mem(gemm_arg_, - stream_config.rotating_count, - gemm_arg_.M * gemm_arg_.K * sizeof(ADataType), - gemm_arg_.K * gemm_arg_.N * sizeof(BDataType)); - rotating_mem.Print(); - - auto run_flush_cache = [&]() { - // flush icache - ck::utility::flush_icache(); - // rotating mem - rotating_mem.Next(); - }; - - ave_time += ck::utility::launch_and_time_kernel_with_preprocess( - stream_config, - run_flush_cache, - kernel, - dim3(gdx, gdy, gdz), - dim3(BlockSize), - 0, - gemm_arg_, - arg.a_grid_desc_ak0_m_ak1_container_[i], - arg.b_grid_desc_bk0_n_bk1_container_[i], - arg.e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i], - arg.compute_ptr_offset_of_batch_, - arg.compute_ptr_offset_of_n_, - num_k_per_block); - } - else - { - ave_time += 
launch_and_time_kernel( - stream_config, - kernel, - dim3(gdx, gdy, gdz), - dim3(BlockSize), - 0, - gemm_arg, - arg.a_grid_desc_ak0_m_ak1_container_[i], - arg.b_grid_desc_bk0_n_bk1_container_[i], - arg.e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i], - arg.compute_ptr_offset_of_batch_, - arg.compute_ptr_offset_of_n_, - num_k_per_block); - } - }; - - if(has_main_k_block_loop) - { - // Tail number always full - if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 || - BlkGemmPipelineVer == BlockGemmPipelineVersion::v3) - { - if(gemm_arg.KBatch > 1) - { - if constexpr(IsSplitKSupported) - { - const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::AtomicAdd, - minimum_occupancy>; - Run(kernel); - } - } - else - { - const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::Set, - minimum_occupancy>; - Run(kernel); - } - } - // Tail number could be One to Seven - else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2) - { - if(gemm_arg.KBatch > 1) - { - if constexpr(IsSplitKSupported) - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == - TailNumber::One) - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::AtomicAdd, - minimum_occupancy, - TailNumber::One>; - 
Run(kernel); - } - else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == - TailNumber::Full) - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::AtomicAdd, - minimum_occupancy, - TailNumber::Full>; - Run(kernel); - } - - if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2) - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == - TailNumber::Two) - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp:: - EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::AtomicAdd, - minimum_occupancy, - TailNumber::Two>; - Run(kernel); - } - } - - if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3) - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == - TailNumber::Three) - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp:: - EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::AtomicAdd, - minimum_occupancy, - TailNumber::Three>; - Run(kernel); - } - } - - if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4) - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == - TailNumber::Four) - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp:: - EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - 
ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::AtomicAdd, - minimum_occupancy, - TailNumber::Four>; - Run(kernel); - } - } - - if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5) - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == - TailNumber::Five) - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp:: - EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::AtomicAdd, - minimum_occupancy, - TailNumber::Five>; - Run(kernel); - } - } - - if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6) - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == - TailNumber::Six) - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp:: - EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::AtomicAdd, - minimum_occupancy, - TailNumber::Six>; - Run(kernel); - } - } - - if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7) - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == - TailNumber::Seven) - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp:: - EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::AtomicAdd, - minimum_occupancy, - TailNumber::Seven>; - Run(kernel); - } - } - } - } - else - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One) - { - const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - 
DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::Set, - minimum_occupancy, - TailNumber::One>; - Run(kernel); - } - else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == - TailNumber::Full) - { - const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::Set, - minimum_occupancy, - TailNumber::Full>; - Run(kernel); - } - - if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2) - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == - TailNumber::Two) - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::Set, - minimum_occupancy, - TailNumber::Two>; - Run(kernel); - } - } - - if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3) - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == - TailNumber::Three) - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::Set, - minimum_occupancy, - TailNumber::Three>; - Run(kernel); - } - } - - if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4) - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == - TailNumber::Four) - { - const 
auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::Set, - minimum_occupancy, - TailNumber::Four>; - Run(kernel); - } - } - - if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5) - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == - TailNumber::Five) - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::Set, - minimum_occupancy, - TailNumber::Five>; - Run(kernel); - } - } - - if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6) - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == - TailNumber::Six) - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::Set, - minimum_occupancy, - TailNumber::Six>; - Run(kernel); - } - } - - if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7) - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == - TailNumber::Seven) - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::Set, - minimum_occupancy, - TailNumber::Seven>; - Run(kernel); - } - } - } 
- } - // Tail number could be Odd or Even - else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4) - { - if(gemm_arg.KBatch > 1) - { - if constexpr(IsSplitKSupported) - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == - TailNumber::Odd) - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3_2lds< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::AtomicAdd, - minimum_occupancy, - TailNumber::Odd>; - Run(kernel); - } - else - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3_2lds< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::AtomicAdd, - minimum_occupancy, - TailNumber::Even>; - Run(kernel); - } - } - } - else - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3_2lds< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::Set, - minimum_occupancy, - TailNumber::Odd>; - Run(kernel); - } - else - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3_2lds< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::Set, - minimum_occupancy, - TailNumber::Even>; - Run(kernel); - } - } - } - else - { - if(gemm_arg.KBatch > 1) - { - if 
constexpr(IsSplitKSupported) - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == - TailNumber::Odd) - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::AtomicAdd, - minimum_occupancy, - TailNumber::Odd>; - Run(kernel); - } - else - { - const auto kernel = - kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::AtomicAdd, - minimum_occupancy, - TailNumber::Even>; - Run(kernel); - } - } - } - else - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) - { - const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::Set, - minimum_occupancy, - TailNumber::Odd>; - Run(kernel); - } - else - { - const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - true, - InMemoryDataOperationEnum::Set, - minimum_occupancy, - TailNumber::Even>; - Run(kernel); - } - } - } - } - else - { - // Tail number always 1 - if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1) - { - if(gemm_arg.KBatch > 1) - { - if constexpr(IsSplitKSupported) - { - const auto kernel = 
kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - false, - InMemoryDataOperationEnum::AtomicAdd, - minimum_occupancy>; - Run(kernel); - } - } - else - { - const auto kernel = kernel_grouped_conv_bwd_data_xdl_cshuffle_v3< - GridwiseGemm, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - ComputePtrOffsetOfStridedBatch, - ComputePtrOffsetOfStridedBatch, - false, - InMemoryDataOperationEnum::Set, - minimum_occupancy>; - Run(kernel); - } - } - } - } - return ave_time; - } - float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { float ave_time = 0; @@ -1940,14 +1083,17 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 static_cast(arg.compute_ptr_offset_of_n_.BatchStrideA_)}, std::array{0}); } - - if constexpr(isMultiD) + if(arg.k_batch_ > 1) { - ave_time += RunMultiDGemm(arg, stream_config); + if constexpr(IsSplitKSupported) + { + ave_time += + RunMultiDGemm(arg, stream_config); + } } else { - ave_time += RunGemmV3(arg, stream_config); + ave_time += RunMultiDGemm(arg, stream_config); } // Transpose from NHWGC to NGCHW @@ -2031,29 +1177,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 const index_t ConvK = arg.b_g_k_c_xs_lengths_[1]; const index_t ConvC = arg.b_g_k_c_xs_lengths_[2]; - if constexpr(!isMultiD) - { - for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++) - { - const index_t GemmM = arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I1); - const index_t GemmN = arg.b_grid_desc_bk0_n_bk1_container_[i].GetLength(I1); - const index_t GemmK = arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I0) * - arg.a_grid_desc_ak0_m_ak1_container_[i].GetLength(I2); - - typename GridwiseGemm::Argument gemm_arg{ - nullptr, 
nullptr, nullptr, GemmM, GemmN, GemmK, I0, I0, I0, arg.k_batch_}; - - const auto num_k_loop = gemm_arg.AK0 / (KPerBlock / AK1); - if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1) - { - if(num_k_loop <= GridwiseGemm::BlockwiseGemmPipe::PrefetchStages) - { - return false; - } - } - } - } - // Specifialization if constexpr(ConvBackwardDataSpecialization == ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0) @@ -2156,16 +1279,14 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 // Gridwise GEMM size for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++) { - if constexpr(isMultiD) + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i], + arg.b_grid_desc_n_k_container_[i], + arg.ds_grid_desc_m_n_container_[i], + arg.e_grid_desc_m_n_container_[i], + arg.block_2_etile_map_container_[i], + arg.k_batch_)) { - if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i], - arg.b_grid_desc_n_k_container_[i], - arg.ds_grid_desc_m_n_container_[i], - arg.e_grid_desc_m_n_container_[i], - arg.block_2_etile_map_container_[i])) - { - return false; - } + return false; } } @@ -2322,17 +1443,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 { auto str = std::stringstream(); - std::map BlkGemmPipelineSchedulerToString{ - {BlockGemmPipelineScheduler::Intrawave, "Intrawave"}, - {BlockGemmPipelineScheduler::Interwave, "Interwave"}}; - - std::map BlkGemmPipelineVersionToString{ - {BlockGemmPipelineVersion::v1, "v1"}, - {BlockGemmPipelineVersion::v2, "v2"}, - {BlockGemmPipelineVersion::v3, "v3"}, - {BlockGemmPipelineVersion::v4, "v4"}, - {BlockGemmPipelineVersion::v5, "v5"}}; - // clang-format off str << "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1" << "<" @@ -2350,11 +1460,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 << ABlockTransferSrcScalarPerVector << ", " << BBlockTransferSrcScalarPerVector << ", " << CShuffleMXdlPerWavePerShuffle << ", " - << CShuffleNXdlPerWavePerShuffle << ", 
" - << "BlkGemmPipelineScheduler: " - << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", " - << "BlkGemmPipelineVersion: " - << BlkGemmPipelineVersionToString[BlkGemmPipelineVer]; + << CShuffleNXdlPerWavePerShuffle; if constexpr(is_NGCHW_NGKHW() || is_NGCDHW_NGKDHW()) { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp index c0148c3b9c..27da1d91a3 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp @@ -179,7 +179,7 @@ __global__ void const long_index_t a_n_offset = amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx)); - GridwiseGemm::template Run( + GridwiseGemm::template Run( p_as_grid + a_group_offset + a_n_offset, p_bs_grid + b_group_offset, p_ds_grid_grp, @@ -434,7 +434,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle using GemmADataType = std::conditional_t, ADataType>; using GemmBDataType = std::conditional_t, BDataType>; -#define GridwiseGemmTemplateParameters \ +#define GridwiseGemmMultiABDTemplateParameters \ GemmADataType, GemmBDataType, AComputeDataType, AccDataType, CShuffleDataType, DsDataType, \ EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, \ InMemoryDataOperationEnum::Set, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, \ @@ -450,11 +450,27 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \ CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVersion::v1, \ BComputeDataType + +#define GridwiseGemmTemplateParameters \ + GemmADataType, GemmBDataType, AComputeDataType, AccDataType, CShuffleDataType, DsDataType, \ + EDataType, AElementwiseOperation, BElementwiseOperation, 
CDEElementwiseOperation, \ + NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, \ + NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, \ + ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, \ + ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, \ + ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, \ + BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, \ + BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, \ + BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, \ + BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \ + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \ + CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVersion::v1, \ + BComputeDataType // Use appropriate gridwise gemm - using GridwiseGemm = - std::conditional_t, - GridwiseGemmMultipleD_xdl_cshuffle>; + using GridwiseGemm = std::conditional_t< + isMultiA || isMultiB, + GridwiseGemmMultipleABD_xdl_cshuffle, + GridwiseGemmMultipleD_xdl_cshuffle>; // If ADataTypes or BDataTypes is tuple, user has to pass std::array with pointers. 
using APointers = diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp index 3c34d77cc9..94a4e0da4c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp @@ -89,7 +89,7 @@ __global__ void group_id = index_t((left + right) / 2); } - GridwiseGemm::template Run( + GridwiseGemm::template Run( gemm_desc_kernel_args[group_id].a_ptr_ + a_group_offset + a_n_offset, gemm_desc_kernel_args[group_id].b_ptr_ + b_group_offset, Tuple<>{}, @@ -350,16 +350,15 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor #define GridwiseGemmTemplateParameters \ ADataType, BDataType, AComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, \ AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, \ - InMemoryDataOperationEnum::Set, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, \ - KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, \ - ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, \ - ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, \ - ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, \ - ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, \ - BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, \ - BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, \ - BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, \ - CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \ + NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, \ + NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, \ + 
ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, \ + ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, \ + ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, \ + BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, \ + BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, \ + BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, \ + BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \ CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \ CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVersion::v1, \ AComputeDataType diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp index aa70a24fc1..cbee4e09f4 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp @@ -65,7 +65,7 @@ __global__ void group_id = index_t((left + right) / 2); } - GridwiseGemm::template Run( + GridwiseGemm::template Run( gemm_desc_ptr[group_id].a_ptr_, gemm_desc_ptr[group_id].b_ptr_, gemm_desc_ptr[group_id].ds_ptr_, @@ -242,7 +242,6 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm( p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); @@ -550,6 +554,9 @@ struct GridwiseGemmMultipleD_xdl_cshuffle return; } + const index_t num_k_per_block = + __builtin_amdgcn_readfirstlane(a_grid_desc_ak0_m_ak1.GetLength(I0) / k_batch); + // HACK: this force m/n_block_data_idx_on_grid into SGPR const index_t m_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); @@ -591,7 +598,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle true, NumGemmKPrefetchStage>( a_grid_desc_ak0_m_ak1, - make_multi_index(0, m_block_data_idx_on_grid, 0), + make_multi_index(num_k_per_block * k_idx, m_block_data_idx_on_grid, 0), 
a_element_op, a_block_desc_ak0_m_ak1, make_multi_index(0, 0, 0), @@ -622,7 +629,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle true, NumGemmKPrefetchStage>( b_grid_desc_bk0_n_bk1, - make_multi_index(0, n_block_data_idx_on_grid, 0), + make_multi_index(num_k_per_block * k_idx, n_block_data_idx_on_grid, 0), b_element_op, b_block_desc_bk0_n_bk1, make_multi_index(0, 0, 0), @@ -688,7 +695,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / - KPerBlock); + (KPerBlock * k_batch)); gridwise_gemm_pipeline.template Run(a_grid_desc_ak0_m_ak1, a_block_desc_ak0_m_ak1, @@ -943,6 +950,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle } template (p_a_grid, - p_b_grid, - p_ds_grid, - p_e_grid, - p_shared, - a_element_op, - b_element_op, - cde_element_op, - a_grid_desc_ak0_m_ak1, - b_grid_desc_bk0_n_bk1, - ds_grid_desc_mblock_mperblock_nblock_nperblock, - e_grid_desc_mblock_mperblock_nblock_nperblock, - block_2_etile_map); + Run( + p_a_grid, + p_b_grid, + p_ds_grid, + p_e_grid, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_etile_map); } template (p_a_grid, - p_b_grid, - p_ds_grid, - p_e_grid, - p_shared, - a_element_op, - b_element_op, - cde_element_op, - a_grid_desc_ak0_m_ak1, - b_grid_desc_bk0_n_bk1, - ds_grid_desc_mblock_mperblock_nblock_nperblock, - e_grid_desc_mblock_mperblock_nblock_nperblock, - block_2_etile_map); + Run( + p_a_grid, + p_b_grid, + p_ds_grid, + p_e_grid, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_etile_map); } }; From 58f9e9ffbc190188f85895deb952cb47cc89c403 Mon Sep 17 00:00:00 
2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Tue, 13 May 2025 10:18:14 -0700 Subject: [PATCH 118/443] Update the buffer load/store intrinsic names for clang>=20. (#2192) * fix the buffer load/store intrinsic names * fix clang format --- .../ck_tile/18_flatmm/run_flatmm_example.inc | 74 +- .../amd_buffer_addressing_builtins.hpp | 20 +- include/ck_tile/core.hpp | 1 - .../arch/amd_buffer_addressing_builtins.hpp | 2559 ----------------- include/ck_tile/core/tensor/buffer_view.hpp | 4 - 5 files changed, 51 insertions(+), 2607 deletions(-) delete mode 100644 include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp diff --git a/example/ck_tile/18_flatmm/run_flatmm_example.inc b/example/ck_tile/18_flatmm/run_flatmm_example.inc index 15a9df2c0c..c191fff7d0 100644 --- a/example/ck_tile/18_flatmm/run_flatmm_example.inc +++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc @@ -4,14 +4,22 @@ #include template -constexpr const char* DataTypeToString() { - if constexpr (std::is_same_v) { +constexpr const char* DataTypeToString() +{ + if constexpr(std::is_same_v) + { return "fp16"; - } else if constexpr (std::is_same_v) { + } + else if constexpr(std::is_same_v) + { return "fp8"; - } else if constexpr (std::is_same_v) { + } + else if constexpr(std::is_same_v) + { return "bf8"; - } else { + } + else + { return "unknown"; } } @@ -112,8 +120,9 @@ float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf, args.stride_B = stride_B; args.stride_C = stride_C; - float ave_time = flatmm_calc( - args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); + float ave_time = + flatmm_calc( + args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_byte = @@ -121,18 +130,15 @@ float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf, float tflops = static_cast(flop) / 1.E9 / ave_time; float gb_per_sec = num_byte / 1.E6 / ave_time; - std::cout << "Run Flatmm kernel with DataType = " << 
DataTypeToString() << " M =" << M << " N =" << N << " K =" << K - << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C - << " : " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " - << std::endl; + std::cout << "Run Flatmm kernel with DataType = " << DataTypeToString() + << " M =" << M << " N =" << N << " K =" << K << " StrideA =" << stride_A + << " StrideB =" << stride_B << " StrideC =" << stride_C << " : " << ave_time + << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl; return ave_time; } -template +template int run_flatmm_example_with_layouts(int argc, char* argv[], const ALayout a_layout = ALayout{}, @@ -147,7 +153,7 @@ int run_flatmm_example_with_layouts(int argc, using BDataType = typename GemmBasicTypeConfig::BDataType; using CDataType = typename GemmBasicTypeConfig::CDataType; using AccDataType = typename GemmBasicTypeConfig::AccDataType; - + ck_tile::index_t M = arg_parser.get_int("m"); ck_tile::index_t N = arg_parser.get_int("n"); ck_tile::index_t K = arg_parser.get_int("k"); @@ -182,7 +188,7 @@ int run_flatmm_example_with_layouts(int argc, c_rslt_host.SetZero(); // do pre-shuffle - std::string mfma = arg_parser.get_str("prec"); + std::string mfma = arg_parser.get_str("prec"); #if defined(USING_MFMA_16x16x32) && defined(ENABLE_FP8) ck_tile::index_t mfma_type = 1; #else @@ -193,18 +199,18 @@ int run_flatmm_example_with_layouts(int argc, b_shuffle_dev_buf.ToDevice(b_shuffle_host.data()); invoke_flatmm( - a_dev_buf, - b_shuffle_dev_buf, - c_dev_buf, - M, - N, - K, - stride_A, - stride_B, - stride_C, - kbatch, - n_warmup, - n_repeat); + a_dev_buf, + b_shuffle_dev_buf, + c_dev_buf, + M, + N, + K, + stride_A, + stride_B, + stride_C, + kbatch, + n_warmup, + n_repeat); c_dev_buf.FromDevice(c_rslt_host.data()); bool pass = true; @@ -219,8 +225,9 @@ int run_flatmm_example_with_layouts(int argc, a_host, b_origin_host, c_ref_host); const float max_accumulated_value = 
*std::max_element(c_ref_host.mData.begin(), c_ref_host.mData.end()); - const auto rtol_atol = calculate_rtol_atol(K, kbatch, max_accumulated_value); - pass = ck_tile::check_err(c_rslt_host, + const auto rtol_atol = calculate_rtol_atol( + K, kbatch, max_accumulated_value); + pass = ck_tile::check_err(c_rslt_host, c_ref_host, "Error: Incorrect results!", rtol_atol.at(ck_tile::number<0>{}), @@ -277,8 +284,9 @@ int run_flatmm_example_with_layouts(int argc, c_gpu_ref_dev_buf.FromDevice(c_gpu_ref_host.data()); const float max_accumulated_value = *std::max_element(c_gpu_ref_host.mData.begin(), c_gpu_ref_host.mData.end()); - const auto rtol_atol = calculate_rtol_atol(K, kbatch, max_accumulated_value); - pass = ck_tile::check_err(c_rslt_host, + const auto rtol_atol = calculate_rtol_atol( + K, kbatch, max_accumulated_value); + pass = ck_tile::check_err(c_rslt_host, c_gpu_ref_host, "Error: Incorrect results!", rtol_atol.at(ck_tile::number<0>{}), diff --git a/include/ck/utility/amd_buffer_addressing_builtins.hpp b/include/ck/utility/amd_buffer_addressing_builtins.hpp index 19869906dc..296c1d44d7 100644 --- a/include/ck/utility/amd_buffer_addressing_builtins.hpp +++ b/include/ck/utility/amd_buffer_addressing_builtins.hpp @@ -80,7 +80,7 @@ __device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2( int32x4_t rsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16"); // buffer atomic-add i32 __device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32( @@ -88,7 +88,7 @@ __device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32( int32x4_t rsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32"); // buffer atomic-add fp32 __device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32( @@ -96,15 +96,15 @@ __device__ float 
llvm_amdgcn_raw_buffer_atomic_add_fp32( int32x4_t rsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32"); // buffer atomic-add fp32 -__device__ double llvm_amdgcn_raw_buffer_atomic_max_fp64( - double vdata, - int32x4_t rsrc, // dst_wave_buffer_resource - int voffset, // dst_thread_addr_offset - int soffset, // dst_wave_addr_offset - int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64.v4i32"); +__device__ double +llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, + int32x4_t rsrc, // dst_wave_buffer_resource + int voffset, // dst_thread_addr_offset + int soffset, // dst_wave_addr_offset + int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64"); // memory coherency bit for buffer store/load instruction // check ISA manual for each GFX target @@ -827,7 +827,7 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, index_t voffset, index_t soffset, index_t offset, - index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds.v4i32"); + index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds"); #ifndef __HIPCC_RTC__ template diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index b9791f0b55..2ea8bf15a7 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -9,7 +9,6 @@ #include "ck_tile/core/algorithm/space_filling_curve.hpp" #include "ck_tile/core/algorithm/static_encoding_pattern.hpp" #include "ck_tile/core/arch/amd_buffer_addressing.hpp" -#include "ck_tile/core/arch/amd_buffer_addressing_builtins.hpp" #include "ck_tile/core/arch/arch.hpp" #include "ck_tile/core/arch/generic_memory_space_atomic.hpp" #include "ck_tile/core/arch/utility.hpp" diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp deleted file mode 100644 index 0b9956cd01..0000000000 --- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp +++ /dev/null @@ 
-1,2559 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#if CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN - -#include "ck_tile/core/numeric/integer.hpp" -#include "ck_tile/core/numeric/integral_constant.hpp" -#include "ck_tile/core/numeric/vector_type.hpp" -#include "ck_tile/core/container/container_helper.hpp" -#include "ck_tile/core/container/thread_buffer.hpp" -#include "ck_tile/core/utility/type_traits.hpp" -#include "ck_tile/core/utility/bit_cast.hpp" -#include "ck_tile/core/utility/functional.hpp" - -namespace ck_tile { - -// 128 bit SGPRs to supply buffer resource in buffer instructions -// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions -struct __attribute__((packed)) buffer_resource -{ - const void* ptr; - uint32_t range; - uint32_t config; -}; - -CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t size = 0xffffffff) -{ - buffer_resource res{ptr, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD}; - int32x4_t r = __builtin_bit_cast(int32x4_t, res); - r.x = __builtin_amdgcn_readfirstlane(r.x); - r.y = __builtin_amdgcn_readfirstlane(r.y); - r.z = __builtin_amdgcn_readfirstlane(r.z); - r.w = __builtin_amdgcn_readfirstlane(r.w); - return r; -} - -namespace impl { -// below type indicate the data type used for buffer load inline asm -// clang-format off -template struct buffer_load_trait; - -template struct buffer_load_trait<16, T> { using payload_t = fp32x4_t; }; -template struct buffer_load_trait<8 , T> { using payload_t = fp32x2_t; }; -template struct buffer_load_trait<4 , T> { using payload_t = float; }; -template struct buffer_load_trait<2 , T> { using payload_t = float; }; -template struct buffer_load_trait<1 , T> { using payload_t = float; }; - -#if CK_TILE_BUFFER_LOAD_RAW_BF16_WA -template<> struct buffer_load_trait<16, thread_buffer> { using payload_t = bf16x8_t; }; -template<> struct 
buffer_load_trait<8 , thread_buffer> { using payload_t = bf16x4_t; }; -template<> struct buffer_load_trait<4 , thread_buffer> { using payload_t = bf16x2_t; }; -#endif -// clang-format on -} // namespace impl - -// TODO: glc/slc/... -template -struct buffer_load; -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast" -// TODO: strict aliasing rule seems fail when reinterpret_cast between vector type -// (exp_vector_type(xxx)) -template -struct buffer_load<16, pre_nop> -{ - template - CK_TILE_DEVICE void operator()(T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t /*flag*/ = 0, - bool_constant = {}) - { - static_assert(sizeof(T) == 16); - using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t; - if constexpr(pre_nop) - asm volatile("s_nop 4\n" - "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset) - : "memory"); - else - asm volatile("buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset) - : "memory"); - } -}; - -template -struct buffer_load<8, pre_nop> -{ - template - CK_TILE_DEVICE void operator()(T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t /*flag*/ = 0, - bool_constant = {}) - { - static_assert(sizeof(T) == 8); - using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t; - if constexpr(pre_nop) - asm volatile("s_nop 4\n" - "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset) - : "memory"); - else - asm volatile("buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset) - : "memory"); - } -}; - -template -struct buffer_load<4, pre_nop> -{ - 
template - CK_TILE_DEVICE void operator()(T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t /*flag*/ = 0, - bool_constant = {}) - { - static_assert(sizeof(T) == 4); - using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t; - if constexpr(pre_nop) - asm volatile("s_nop 4\n" - "buffer_load_dword %0, %1, %2, 0 offen offset:%3" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset) - : "memory"); - else - asm volatile("buffer_load_dword %0, %1, %2, 0 offen offset:%3" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset) - : "memory"); - } -}; - -template -struct buffer_load<2, pre_nop> -{ - template - CK_TILE_DEVICE void operator()(T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t /*flag*/ = 0, - bool_constant = {}) - { - static_assert(sizeof(T) == 4); // subdword is buggy, use dword buf and convert manually - using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t; - if constexpr(pre_nop) - asm volatile("s_nop 4\n" - "buffer_load_ushort %0, %1, %2, 0 offen offset:%3" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset) - : "memory"); - else - asm volatile("buffer_load_ushort %0, %1, %2, 0 offen offset:%3" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset) - : "memory"); - } -}; - -template -struct buffer_load<1, pre_nop> -{ - template - CK_TILE_DEVICE void operator()(T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t /*flag*/ = 0, - bool_constant = {}) - { - static_assert(sizeof(T) == 4); - using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t; - if constexpr(pre_nop) - asm volatile("s_nop 4\n" - "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3" - : "+v"(reinterpret_cast(value)) - : 
"v"(v_offset), "s"(res), "n"(i_offset) - : "memory"); - else - asm volatile("buffer_load_ubyte %0, %1, %2, 0 offen offset:%3" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset) - : "memory"); - } -}; - -template -struct buffer_load_if; - -template -struct buffer_load_if<16, pre_nop> -{ - template - CK_TILE_DEVICE void operator()(T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t flag = 0, - bool_constant = {}) - { - static_assert(sizeof(T) == 16); - auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t; - static_assert(sizeof(mbuf_t) == sizeof(T)); - if constexpr(pre_nop) - asm volatile("s_nop 4\n" - "v_cmpx_le_u32 exec, 1, %4\n" - "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3\n" - "s_mov_b64 exec %5" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec) - : "memory"); - else - asm volatile("v_cmpx_le_u32 exec, 1, %4\n" - "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3\n" - "s_mov_b64 exec %5" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec) - : "memory"); - } -}; - -template -struct buffer_load_if<8, pre_nop> -{ - template - CK_TILE_DEVICE void operator()(T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t flag = 0, - bool_constant = {}) - { - static_assert(sizeof(T) == 8); - auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t; - if constexpr(pre_nop) - asm volatile("s_nop 4\n" - "v_cmpx_le_u32 exec, 1, %4\n" - "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3\n" - "s_mov_b64 exec %5" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec) - : "memory"); - else - asm volatile("v_cmpx_le_u32 
exec, 1, %4\n" - "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3\n" - "s_mov_b64 exec %5" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec) - : "memory"); - } -}; - -template -struct buffer_load_if<4, pre_nop> -{ - template - CK_TILE_DEVICE void operator()(T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t flag = 0, - bool_constant = {}) - { - static_assert(sizeof(T) == 4); - auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t; - if constexpr(pre_nop) - asm volatile("s_nop 4\n" - "v_cmpx_le_u32 exec, 1, %4\n" - "buffer_load_dword %0, %1, %2, 0 offen offset:%3\n" - "s_mov_b64 exec %5" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec) - : "memory"); - else - asm volatile("v_cmpx_le_u32 exec, 1, %4\n" - "buffer_load_dword %0, %1, %2, 0 offen offset:%3\n" - "s_mov_b64 exec %5" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec) - : "memory"); - } -}; - -template -struct buffer_load_if<2, pre_nop> -{ - template - CK_TILE_DEVICE void operator()(T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t flag = 0, - bool_constant = {}) - { - static_assert(sizeof(T) == 4); - auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t; - if constexpr(pre_nop) - asm volatile("s_nop 4\n" - "v_cmpx_le_u32 exec, 1, %4\n" - "buffer_load_ushort %0, %1, %2, 0 offen offset:%3\n" - "s_mov_b64 exec %5" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec) - : "memory"); - else - asm volatile("v_cmpx_le_u32 exec, 1, %4\n" - "buffer_load_ushort %0, %1, %2, 0 offen offset:%3\n" - "s_mov_b64 exec %5" - 
: "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec) - : "memory"); - } -}; - -template -struct buffer_load_if<1, pre_nop> -{ - template - CK_TILE_DEVICE void operator()(T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t flag = 0, - bool_constant = {}) - { - static_assert(sizeof(T) == 4); - auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t; - if constexpr(pre_nop) - asm volatile("s_nop 4\n" - "v_cmpx_le_u32 exec, 1, %4\n" - "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3\n" - "s_mov_b64 exec %5" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec) - : "memory"); - else - asm volatile("v_cmpx_le_u32 exec, 1, %4\n" - "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3\n" - "s_mov_b64 exec %5" - : "+v"(reinterpret_cast(value)) - : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec) - : "memory"); - } -}; -#pragma clang diagnostic pop // "-Wundefined-reinterpret-cast" -template -struct buffer_store; - -template <> -struct buffer_store<16> -{ - template - CK_TILE_DEVICE void operator()(const T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t /*flag*/ = 1) - { - static_assert(sizeof(T) == 16); - using mbuf_t = fp32x4_t; - asm volatile("buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3" - : - : "v"(bit_cast(value)), "v"(v_offset), "s"(res), "n"(i_offset) - : "memory"); - } -}; - -template <> -struct buffer_store<8> -{ - template - CK_TILE_DEVICE void operator()(const T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t /*flag*/ = 1) - { - static_assert(sizeof(T) == 8); - using mbuf_t = fp32x2_t; - asm volatile("buffer_store_dwordx2 %0, %1, %2, 0 
offen offset:%3" - : - : "v"(bit_cast(value)), "v"(v_offset), "s"(res), "n"(i_offset) - : "memory"); - } -}; - -template <> -struct buffer_store<4> -{ - template - CK_TILE_DEVICE void operator()(const T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t /*flag*/ = 1) - { - static_assert(sizeof(T) == 4); - using mbuf_t = float; - asm volatile("buffer_store_dword %0, %1, %2, 0 offen offset:%3" - : - : "v"(bit_cast(value)), "v"(v_offset), "s"(res), "n"(i_offset) - : "memory"); - } -}; - -template <> -struct buffer_store<2> -{ - template - CK_TILE_DEVICE void operator()(const T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t /*flag*/ = 1) - { - static_assert(sizeof(T) == 2); - using mbuf_t = short; - asm volatile("buffer_store_short %0, %1, %2, 0 offen offset:%3" - : - : "v"(bit_cast(value)), "v"(v_offset), "s"(res), "n"(i_offset) - : "memory"); - } -}; - -template <> -struct buffer_store<1> -{ - template - CK_TILE_DEVICE void operator()(const T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t /*flag*/ = 1) - { - static_assert(sizeof(T) == 4); - using mbuf_t = float; - asm volatile("buffer_store_byte %0, %1, %2, 0 offen offset:%3" - : - : "v"(bit_cast(value)), "v"(v_offset), "s"(res), "n"(i_offset) - : "memory"); - } -}; - -template -struct buffer_store_if; - -template <> -struct buffer_store_if<16> -{ - template - CK_TILE_DEVICE void operator()(const T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t flag = 1) - { - static_assert(sizeof(T) == 16); - auto save_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = fp32x4_t; - asm volatile("v_cmpx_le_u32 exec, 1, %4\n" - "buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3\n" - "s_mov_b64 exec 
%5" - : - : "v"(bit_cast(value)), - "v"(v_offset), - "s"(res), - "n"(i_offset), - "v"(flag), - "s"(save_exec) - : "memory"); - } -}; - -template <> -struct buffer_store_if<8> -{ - template - CK_TILE_DEVICE void operator()(const T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t flag = 1) - { - static_assert(sizeof(T) == 8); - auto save_exec = __builtin_amdgcn_read_exec(); - // TODO: ugly. rocm-6.0/6.1 seems neet bit_cast to same base type to avoid scratch - using mbuf_t = ext_vector_t; - asm volatile("v_cmpx_le_u32 exec, 1, %4\n" - "buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3\n" - "s_mov_b64 exec %5" - : - : "v"(bit_cast(value)), - "v"(v_offset), - "s"(res), - "n"(i_offset), - "v"(flag), - "s"(save_exec) - : "memory"); - } -}; - -template <> -struct buffer_store_if<4> -{ - template - CK_TILE_DEVICE void operator()(const T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t flag = 1) - { - static_assert(sizeof(T) == 4); - auto save_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = float; - asm volatile("v_cmpx_le_u32 exec, 1, %4\n" - "buffer_store_dword %0, %1, %2, 0 offen offset:%3\n" - "s_mov_b64 exec %5" - : - : "v"(bit_cast(value)), - "v"(v_offset), - "s"(res), - "n"(i_offset), - "v"(flag), - "s"(save_exec) - : "memory"); - } -}; - -template <> -struct buffer_store_if<2> -{ - template - CK_TILE_DEVICE void operator()(const T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t flag = 1) - { - static_assert(sizeof(T) == 2); - auto save_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = short; - asm volatile("v_cmpx_le_u32 exec, 1, %4\n" - "buffer_store_short %0, %1, %2, 0 offen offset:%3\n" - "s_mov_b64 exec %5" - : - : "v"(bit_cast(value)), - "v"(v_offset), - "s"(res), - "n"(i_offset), - "v"(flag), - 
"s"(save_exec) - : "memory"); - } -}; - -template <> -struct buffer_store_if<1> -{ - template - CK_TILE_DEVICE void operator()(const T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t flag = 1) - { - static_assert(sizeof(T) == 4); - auto save_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = float; - asm volatile("v_cmpx_le_u32 exec, 1, %4\n" - "buffer_store_byte %0, %1, %2, 0 offen offset:%3\n" - "s_mov_b64 exec %5" - : - : "v"(bit_cast(value)), - "v"(v_offset), - "s"(res), - "n"(i_offset), - "v"(flag), - "s"(save_exec) - : "memory"); - } -}; - -CK_TILE_DEVICE void buffer_load_fence(index_t cnt = 0) -{ - asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory"); -} - -CK_TILE_DEVICE void lds_load_fence(index_t cnt = 0) -{ - asm volatile("s_waitcnt lgkmcnt(%0)" : : "n"(cnt) : "memory"); -} - -template -struct buffer_atomic_add_if; - -template -struct buffer_atomic_add_if -{ - template - CK_TILE_DEVICE void operator()(const T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t flag = 1) - { - static_assert(sizeof(T) == 4); - auto save_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = float; - asm volatile("v_cmpx_le_u32 exec, 1, %4\n" - "global_atomic_pk_add_bf16 %0, %1, %2 offset:%3\n" - "s_mov_b64 exec %5" - : - : "v"(v_offset), - "v"(bit_cast(value)), - "s"(res.xy), - "n"(i_offset), - "v"(flag), - "s"(save_exec) - : "memory"); - } -}; - -template -struct buffer_atomic_add; - -template -struct buffer_atomic_add -{ - template - CK_TILE_DEVICE void operator()(const T& value, - int32x4_t res /*buffer resource*/, - index_t v_offset, - index_t /*s_offset*/, - index_t i_offset /*max 0xFFF*/, - index_t /*flag = 1*/) - { - static_assert(sizeof(T) == 4); - using mbuf_t = float; - asm volatile("global_atomic_pk_add_bf16 %0, %1, %2 offset:%3" - : - : "v"(v_offset), "v"(bit_cast(value)), "s"(res.xy), 
"n"(i_offset) - : "memory"); - } -}; - -namespace impl { -// below type indicate the data type used for buffer load inline asm -// clang-format off -template struct smem_load_trait; - -template struct smem_load_trait<16, T> { using payload_t = fp32x4_t; }; -template struct smem_load_trait<8 , T> { using payload_t = fp32x2_t; }; -template struct smem_load_trait<4 , T> { using payload_t = float; }; -template struct smem_load_trait<2 , T> { using payload_t = float; }; -template struct smem_load_trait<1 , T> { using payload_t = float; }; - -// clang-format on -} // namespace impl - -// NOTE: smem load/store no need pre_nop to make sure dependency by sw, happy :) -template -struct smem_load; - -template <> -struct smem_load<16> -{ - template - CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset) - { - static_assert(sizeof(T) == 16); - using mbuf_t = typename impl::smem_load_trait<16, T>::payload_t; - asm volatile("ds_read_b128 %0, %1 offset:%2" - : "=v"(reinterpret_cast(value)) // ! direct write - : "v"(v_offset), "n"(i_offset) - : "memory"); - } -}; - -template <> -struct smem_load<8> -{ - template - CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset) - { - static_assert(sizeof(T) == 8); - using mbuf_t = typename impl::smem_load_trait<8, T>::payload_t; - asm volatile("ds_read_b64 %0, %1 offset:%2" - : "=v"(reinterpret_cast(value)) // ! direct write - : "v"(v_offset), "n"(i_offset) - : "memory"); - } -}; - -template <> -struct smem_load<4> -{ - template - CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset) - { - static_assert(sizeof(T) == 4); - using mbuf_t = typename impl::smem_load_trait<4, T>::payload_t; - asm volatile("ds_read_b32 %0, %1 offset:%2" - : "=v"(reinterpret_cast(value)) // ! 
direct write - : "v"(v_offset), "n"(i_offset) - : "memory"); - } -}; - -template <> -struct smem_load<2> -{ - template - CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset) - { - static_assert(sizeof(T) == 4); // subdword is buggy, use dword buf and convert manually - using mbuf_t = typename impl::smem_load_trait<1, T>::payload_t; - asm volatile("ds_read_u16 %0, %1 offset:%2" - : "=v"(reinterpret_cast(value)) // ! direct write - : "v"(v_offset), "n"(i_offset) - : "memory"); - } -}; - -template <> -struct smem_load<1> -{ - template - CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset) - { - static_assert(sizeof(T) == 4); - using mbuf_t = typename impl::smem_load_trait<1, T>::payload_t; - asm volatile("ds_read_u8 %0, %1 offset:%2" - : "=v"(reinterpret_cast(value)) // ! direct write - : "v"(v_offset), "n"(i_offset) - : "memory"); - } -}; - -// clang-format off -namespace impl{ - -// can't use "+v" since there could be potential extra move(read/write) -// use "v" can help remove such duplicated moves -// besides, fake this as "memory" operation to force later valu after this fence -// TODO: may have scratch (because this is memory?) 
-// need to reduce extra move inside compiler -template -CK_TILE_DEVICE void insert_dummy_dep_per_dword(array& b) -{ - constexpr auto kSize = remove_cvref_t::size(); - static_for<0, kSize, 1>{}([&](auto i){ - asm volatile(" " : : "v"(b.get(number{})) : "memory"); - }); -} -#if 1 -// below specialization just merge size() of dwords into single section -template<> -CK_TILE_DEVICE void insert_dummy_dep_per_dword<2>(array& b) -{ - asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})) : "memory"); -} - -template<> -CK_TILE_DEVICE void insert_dummy_dep_per_dword<3>(array& b) -{ - asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})), "v"(b.get(number<2>{})) : "memory"); -} - -template<> -CK_TILE_DEVICE void insert_dummy_dep_per_dword<4>(array& b) -{ - asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})), "v"(b.get(number<2>{})), "v"(b.get(number<3>{})) : "memory"); -} - -template<> -CK_TILE_DEVICE void insert_dummy_dep_per_dword<8>(array& b) -{ - asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})), "v"(b.get(number<2>{})), "v"(b.get(number<3>{})), - "v"(b.get(number<4>{})), "v"(b.get(number<5>{})), "v"(b.get(number<6>{})), "v"(b.get(number<7>{})) : "memory"); -} - -template<> -CK_TILE_DEVICE void insert_dummy_dep_per_dword<16>(array& b) -{ - asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})), "v"(b.get(number<2>{})), "v"(b.get(number<3>{})), - "v"(b.get(number<4>{})), "v"(b.get(number<5>{})), "v"(b.get(number<6>{})), "v"(b.get(number<7>{})), - "v"(b.get(number<8>{})), "v"(b.get(number<9>{})), "v"(b.get(number<10>{})), "v"(b.get(number<11>{})), - "v"(b.get(number<12>{})), "v"(b.get(number<13>{})), "v"(b.get(number<14>{})), "v"(b.get(number<15>{})) : "memory"); -} - -template<> -CK_TILE_DEVICE void insert_dummy_dep_per_dword<32>(array& b) -{ - asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})), "v"(b.get(number<2>{})), "v"(b.get(number<3>{})), - "v"(b.get(number<4>{})), 
"v"(b.get(number<5>{})), "v"(b.get(number<6>{})), "v"(b.get(number<7>{})), - "v"(b.get(number<8>{})), "v"(b.get(number<9>{})), "v"(b.get(number<10>{})), "v"(b.get(number<11>{})), - "v"(b.get(number<12>{})), "v"(b.get(number<13>{})), "v"(b.get(number<14>{})), "v"(b.get(number<15>{})), - "v"(b.get(number<16>{})), "v"(b.get(number<17>{})), "v"(b.get(number<18>{})), "v"(b.get(number<19>{})), - "v"(b.get(number<20>{})), "v"(b.get(number<21>{})), "v"(b.get(number<22>{})), "v"(b.get(number<23>{})), - "v"(b.get(number<24>{})), "v"(b.get(number<25>{})), "v"(b.get(number<26>{})), "v"(b.get(number<27>{})), - "v"(b.get(number<28>{})), "v"(b.get(number<29>{})), "v"(b.get(number<30>{})), "v"(b.get(number<31>{})) : "memory"); -} -#endif -CK_TILE_DEVICE void insert_dummy_dep() {} - -template -CK_TILE_DEVICE void insert_dummy_dep(T & buffer) -{ - // TODO: indeed we expect T to be multiple of dword. subdword is always buggy - using da_type = array; - auto & dummy = reinterpret_cast(buffer); - insert_dummy_dep_per_dword(dummy); -} - -template -CK_TILE_DEVICE void insert_dummy_dep(Tx& bx, Ty&... by) -{ - insert_dummy_dep(bx); - insert_dummy_dep(by...); -} -} -// clang-format on -template -CK_TILE_DEVICE void buffer_load_fence(index_t cnt = 0, T&... 
o) -{ - asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory"); - impl::insert_dummy_dep(o...); -} - -CK_TILE_DEVICE void buffer_store_fence(index_t cnt = 0) -{ - asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory"); -} - -CK_TILE_DEVICE auto async_load_fence_raw(index_t cnt = 0) -{ - asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory"); -} - -// buffer load i8 -CK_TILE_DEVICE_EXTERN int8_t -llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8.v4i32"); - -CK_TILE_DEVICE_EXTERN int8x2_t -llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8.v4i32"); - -CK_TILE_DEVICE_EXTERN int8x4_t -llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8.v4i32"); - -// buffer load i16 -CK_TILE_DEVICE_EXTERN int16_t -llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16.v4i32"); - -CK_TILE_DEVICE_EXTERN int16x2_t -llvm_amdgcn_raw_buffer_load_i16x2(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16.v4i32"); - -CK_TILE_DEVICE_EXTERN int16x4_t -llvm_amdgcn_raw_buffer_load_i16x4(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16.v4i32"); - -// buffer load i32 -CK_TILE_DEVICE_EXTERN int32_t -llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32.v4i32"); - -CK_TILE_DEVICE_EXTERN int32x2_t -llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32.v4i32"); - -CK_TILE_DEVICE_EXTERN int32x4_t 
-llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32.v4i32"); - -// buffer load fp16 -CK_TILE_DEVICE_EXTERN _Float16 -llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16.v4i32"); - -CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_load_fp16x2( - int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16.v4i32"); - -CK_TILE_DEVICE_EXTERN fp16x4_t llvm_amdgcn_raw_buffer_load_fp16x4( - int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16.v4i32"); - -// buffer load fp32 -CK_TILE_DEVICE_EXTERN float -llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32.v4i32"); - -CK_TILE_DEVICE_EXTERN fp32x2_t llvm_amdgcn_raw_buffer_load_fp32x2( - int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32.v4i32"); - -CK_TILE_DEVICE_EXTERN fp32x4_t llvm_amdgcn_raw_buffer_load_fp32x4( - int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32.v4i32"); - -// buffer store i8 -CK_TILE_DEVICE_EXTERN void -llvm_amdgcn_raw_buffer_store_i8(int8_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8.v4i32"); - -CK_TILE_DEVICE_EXTERN void -llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8.v4i32"); - -CK_TILE_DEVICE_EXTERN void -llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8.v4i32"); - -// 
buffer store i16 -CK_TILE_DEVICE_EXTERN void -llvm_amdgcn_raw_buffer_store_i16(int16_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16.v4i32"); - -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16x2( - int16x2_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16.v4i32"); - -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16x4( - int16x4_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16.v4i32"); - -// buffer store i32 -CK_TILE_DEVICE_EXTERN void -llvm_amdgcn_raw_buffer_store_i32(int32_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32.v4i32"); - -// buffer store ui16 -CK_TILE_DEVICE_EXTERN void -llvm_amdgcn_raw_buffer_store_ui16(uint16_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16.v4i32"); - -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16x2( - uint16x2_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16.v4i32"); - -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16x4( - uint16x4_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16.v4i32"); - -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32x2( - int32x2_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32.v4i32"); - -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32x4( - int32x4_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32.v4i32"); - -// buffer store fp16 -CK_TILE_DEVICE_EXTERN void 
-llvm_amdgcn_raw_buffer_store_fp16(_Float16 vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16.v4i32"); - -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16x2( - fp16x2_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16.v4i32"); - -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16x4( - fp16x4_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16.v4i32"); - -// buffer store fp32 -CK_TILE_DEVICE_EXTERN void -llvm_amdgcn_raw_buffer_store_fp32(float vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32.v4i32"); - -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32x2( - fp32x2_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32.v4i32"); - -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32x4( - fp32x4_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32.v4i32"); - -// buffer atomic-add fp16 -CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2( - fp16x2_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16.v4i32"); - -// buffer atomic-add i32 -CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_atomic_add_i32( - int32_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32.v4i32"); - -// buffer atomic-add fp32 -CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32( - float vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32.v4i32"); - -// buffer 
atomic-max fp64 -CK_TILE_DEVICE_EXTERN double llvm_amdgcn_raw_buffer_atomic_max_fp64( - double vdata, - int32x4_t rsrc, // dst_wave_buffer_resource - int voffset, // dst_thread_addr_offset - int soffset, // dst_wave_addr_offset - int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64.v4i32"); - -// Direct loads from global to LDS. -CK_TILE_DEVICE_EXTERN void -llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, - __attribute__((address_space(3))) uint32_t* lds_ptr, - index_t size, - index_t voffset, - index_t soffset, - index_t offset, - index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds.v4i32"); - -template -CK_TILE_DEVICE void async_buffer_load_dword_v(void* smem, - int32x4_t rsrc, - index_t voffset, - index_t /*soffset*/, - index_t ioffset /*max 0xFFF*/, - index_t /*flag*/ = 0, - bool_constant = {}) -{ - if constexpr(pre_nop) - asm volatile("s_nop 4\n" - "buffer_load_dword %1, %2, 0 offen offset:%3 lds" - : "=r"(smem) /*dummy dependency for smem*/ - : "v"(voffset), "s"(rsrc), "n"(ioffset) - : "memory"); - else - asm volatile("buffer_load_dword %1, %2, 0 offen offset:%3 lds" - : "=r"(smem) /*dummy dependency for smem*/ - : "v"(voffset), "s"(rsrc), "n"(ioffset) - : "memory"); -} - -CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0) -{ - asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory"); -} - -// memory coherency bit for buffer store/load instruction -// check ISA manual for each GFX target -// e.g. for -// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf, -// page 67~68 -enum struct amd_buffer_coherence_enum -{ - coherence_default = 0, // default value - glc = 1, - slc = 2, - glc_slc = 3, -}; - -template -CK_TILE_DEVICE thread_buffer -amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource, - index_t src_thread_addr_offset, - index_t src_wave_addr_offset) -{ - static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64, - "wrong! 
not implemented"); - - using rtn_type = thread_buffer; - - if constexpr(N == 1) - { - return bit_cast(llvm_amdgcn_raw_buffer_load_i8(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence))); - } - else if constexpr(N == 2) - { - - int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); - - return bit_cast(tmp); - } - else if constexpr(N == 4) - { - int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); - - return bit_cast(tmp); - } - else if constexpr(N == 8) - { - int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); - - return bit_cast(tmp); - } - else if constexpr(N == 16) - { - int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); - return bit_cast(tmp); - } - else if constexpr(N == 32) - { - int32x4_t tmp0 = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); - int32x4_t tmp1 = - llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 4 * sizeof(int32_t), - static_cast(coherence)); - thread_buffer tmp; - - tmp.template get_as()(number<0>{}) = tmp0; - tmp.template get_as()(number<1>{}) = tmp1; - - return bit_cast(tmp); - } - else if constexpr(N == 64) - { - int32x4_t tmp0 = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); - int32x4_t tmp1 = - llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 4 * sizeof(int32_t), - static_cast(coherence)); - int32x4_t tmp2 
= - llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 8 * sizeof(int32_t), - static_cast(coherence)); - int32x4_t tmp3 = - llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 12 * sizeof(int32_t), - static_cast(coherence)); - - thread_buffer tmp; - - tmp.template get_as()(number<0>{}) = tmp0; - tmp.template get_as()(number<1>{}) = tmp1; - tmp.template get_as()(number<2>{}) = tmp2; - tmp.template get_as()(number<3>{}) = tmp3; - - return bit_cast(tmp); - } -} - -#ifndef BUFFER_LOAD_USE_INLINEASM -#define BUFFER_LOAD_USE_INLINEASM 0 -#endif - -template -CK_TILE_DEVICE thread_buffer amd_buffer_load_impl(int32x4_t src_wave_buffer_resource, - index_t src_thread_addr_offset, - index_t src_wave_addr_offset) -{ - static_assert( - (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || - (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || - (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || - (std::is_same::value && - (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (std::is_same::value && - (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)), - "wrong! 
not implemented"); - - using rtn_type = thread_buffer; - - if constexpr(std::is_same::value) // fp32 - { - if constexpr(N == 1) - { - return bit_cast( - llvm_amdgcn_raw_buffer_load_fp32(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence))); - } - else if constexpr(N == 2) - { - return bit_cast( - llvm_amdgcn_raw_buffer_load_fp32x2(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence))); - } - else if constexpr(N == 4) - { - return bit_cast( - llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence))); - } - else if constexpr(N == 8) - { - thread_buffer tmp; - - tmp.template get_as()(number<0>{}) = - llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); - - tmp.template get_as()(number<1>{}) = - llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 4 * sizeof(float), - static_cast(coherence)); - - return tmp; - } - else if constexpr(N == 16) - { - thread_buffer tmp; - - tmp.template get_as()(number<0>{}) = - llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); - - tmp.template get_as()(number<1>{}) = - llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 4 * sizeof(float), - static_cast(coherence)); - - tmp.template get_as()(number<2>{}) = - llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 8 * sizeof(float), - static_cast(coherence)); - - tmp.template get_as()(number<3>{}) = - llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 12 * sizeof(float), - static_cast(coherence)); - - return tmp; - } 
- } - else if constexpr(std::is_same::value) // fp16 - { - if constexpr(N == 1) - { - return bit_cast( - llvm_amdgcn_raw_buffer_load_fp16(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence))); - } - else if constexpr(N == 2) - { - return bit_cast( - llvm_amdgcn_raw_buffer_load_fp16x2(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence))); - } - else if constexpr(N == 4) - { - return bit_cast( - llvm_amdgcn_raw_buffer_load_fp16x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence))); - } - else if constexpr(N == 8) - { - // use fp32 load to mimic fp16 load - fp32x4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); - - return bit_cast(tmp); - } - } - else if constexpr(std::is_same::value) // bf16 - { - if constexpr(N == 1) - { - return bit_cast( - llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence))); - } - else if constexpr(N == 2) - { - return bit_cast( - llvm_amdgcn_raw_buffer_load_i16x2(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence))); - } - else if constexpr(N == 4) - { - return bit_cast( - llvm_amdgcn_raw_buffer_load_i16x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence))); - } - else if constexpr(N == 8) - { - int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); - - return bit_cast(tmp); - } - } - else // other datatype - { - auto raw_data = amd_buffer_load_impl_with_bytes( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset); - - return bit_cast(raw_data); - } -} - -template -CK_TILE_DEVICE void 
amd_buffer_load_raw_impl(thread_buffer& dst, - int32x4_t src_wave_buffer_resource, - index_t src_thread_addr_offset, - index_t src_wave_addr_offset, - index_t src_linear_addr_offset, - index_t flag = 0, - bool_constant = {}) -{ - constexpr index_t bytes = sizeof(T) * N; - static_assert(bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8 || bytes == 16, - "wrong! not supported by buffer_load instruction"); - - using type = thread_buffer; - if constexpr(oob_conditional_check) - { - buffer_load_if{}(dst, - src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - src_linear_addr_offset, - flag, - bool_constant{}); - } - else - { - buffer_load{}(dst, - src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - src_linear_addr_offset, - flag, - bool_constant{}); - } -} - -template -CK_TILE_DEVICE void amd_async_buffer_load_impl(T* smem, - int32x4_t src_wave_buffer_resource, - index_t src_thread_addr_offset, - index_t src_wave_addr_offset, - index_t src_immediate_addr_offset = 0, - bool_constant = {}) -{ - static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size"); - - async_buffer_load_dword_v(smem, - src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - src_immediate_addr_offset, - 0, - bool_constant{}); -} - -template -CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T* smem, - int32x4_t src_wave_buffer_resource, - index_t src_thread_addr_offset, - index_t src_wave_addr_offset, - index_t src_immediate_addr_offset = 0, - index_t flag = 0, - bool_constant = {}) -{ - static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size"); - - if constexpr(oob_conditional_check) - { - index_t v_offset = flag ? 
v_offset : src_wave_buffer_resource[2]; - llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource, - smem, - sizeof(uint32_t), - v_offset, - src_wave_addr_offset, - src_immediate_addr_offset, - static_cast(coherence)); - } - else - { - llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource, - smem, - sizeof(uint32_t), - src_thread_addr_offset, - src_wave_addr_offset, - src_immediate_addr_offset, - static_cast(coherence)); - } -} - -template -CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer src_thread_data, - int32x4_t dst_wave_buffer_resource, - index_t dst_thread_addr_offset, - index_t dst_wave_addr_offset) -{ - static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64, - "wrong! not implemented"); - - if constexpr(N == 1) - { - llvm_amdgcn_raw_buffer_store_i8(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N == 2) - { - - llvm_amdgcn_raw_buffer_store_i16(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N == 4) - { - llvm_amdgcn_raw_buffer_store_i32(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N == 8) - { - llvm_amdgcn_raw_buffer_store_i32x2(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N == 16) - { - llvm_amdgcn_raw_buffer_store_i32x4(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N == 32) - { - llvm_amdgcn_raw_buffer_store_i32x4( - src_thread_data.template get_as()[number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 
static_cast(coherence)); - - llvm_amdgcn_raw_buffer_store_i32x4( - src_thread_data.template get_as()[number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + sizeof(int32_t) * 4, - static_cast(coherence)); - } - else if constexpr(N == 64) - { - llvm_amdgcn_raw_buffer_store_i32x4( - src_thread_data.template get_as()[number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - - llvm_amdgcn_raw_buffer_store_i32x4( - src_thread_data.template get_as()[number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + sizeof(int32_t) * 4, - static_cast(coherence)); - - llvm_amdgcn_raw_buffer_store_i32x4( - src_thread_data.template get_as()[number<2>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + sizeof(int32_t) * 8, - static_cast(coherence)); - - llvm_amdgcn_raw_buffer_store_i32x4( - src_thread_data.template get_as()[number<3>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + sizeof(int32_t) * 12, - static_cast(coherence)); - } -} - -template -CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer src_thread_data, - int32x4_t dst_wave_buffer_resource, - index_t dst_thread_addr_offset, - index_t dst_wave_addr_offset) -{ - static_assert( - (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || - (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (std::is_same::value && - (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - 
(std::is_same::value && - (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)), - "wrong! not implemented"); - - if constexpr(std::is_same::value) // fp32 - { - if constexpr(N == 1) - { - llvm_amdgcn_raw_buffer_store_fp32(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N == 2) - { - llvm_amdgcn_raw_buffer_store_fp32x2(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N == 4) - { - llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N == 8) - { - llvm_amdgcn_raw_buffer_store_fp32x4( - src_thread_data.template get_as()[number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - llvm_amdgcn_raw_buffer_store_fp32x4( - src_thread_data.template get_as()[number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + 4 * sizeof(float), - static_cast(coherence)); - } - } - else if constexpr(std::is_same::value) // fp16 - { - if constexpr(N == 1) - { - llvm_amdgcn_raw_buffer_store_fp16(bit_cast<_Float16>(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N == 2) - { - llvm_amdgcn_raw_buffer_store_fp16x2(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N == 4) - { - llvm_amdgcn_raw_buffer_store_fp16x4(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N == 8) - 
{ -#if 0 - thread_buffer tmp{src_thread_data}; - - llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as()[number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - - llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as()[number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + 4 * sizeof(fp16_t), - static_cast(coherence)); -#else - llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); -#endif - } - } - else if constexpr(std::is_same::value) // bf16 - { - if constexpr(N == 1) - { - llvm_amdgcn_raw_buffer_store_i16(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N == 2) - { - llvm_amdgcn_raw_buffer_store_i16x2(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N == 4) - { - llvm_amdgcn_raw_buffer_store_i16x4(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N == 8) - { - llvm_amdgcn_raw_buffer_store_i16x4( - src_thread_data.template get_as()[number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - - llvm_amdgcn_raw_buffer_store_i16x4( - src_thread_data.template get_as()[number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + 4 * sizeof(bf16_t), - static_cast(coherence)); - } - } - else if constexpr(std::is_same::value) - { - if constexpr(N == 1) - { - llvm_amdgcn_raw_buffer_store_ui16(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N 
== 2) - { - llvm_amdgcn_raw_buffer_store_ui16x2(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N == 4) - { - llvm_amdgcn_raw_buffer_store_ui16x4(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N == 8) - { - llvm_amdgcn_raw_buffer_store_ui16x4( - src_thread_data.template get_as()[number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - static_cast(coherence)); - - llvm_amdgcn_raw_buffer_store_ui16x4( - src_thread_data.template get_as()[number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + 4 * sizeof(uint16_t), - static_cast(coherence)); - } - } - else - { - using r_t = thread_buffer; - - amd_buffer_store_impl_with_bytes(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset); - } -} - -template -CK_TILE_DEVICE void amd_buffer_store_raw_impl(const thread_buffer& dst_thread_data, - int32x4_t dst_wave_buffer_resource, - index_t dst_thread_addr_offset, - index_t dst_wave_addr_offset, - index_t dst_linear_addr_offset, - index_t is_valid_element = 1) -{ - constexpr index_t bytes = sizeof(T) * N; - static_assert(bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8 || bytes == 16, - "wrong! 
not supported by buffer_store instruction"); - - using type = thread_buffer; - if constexpr(oob_conditional_check) - { - buffer_store_if{}(dst_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - dst_linear_addr_offset, - is_valid_element); - } - else - { - buffer_store{}(dst_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - dst_linear_addr_offset); - } -} - -template -CK_TILE_DEVICE void amd_buffer_atomic_add_impl(const thread_buffer& src_thread_data, - int32x4_t dst_wave_buffer_resource, - index_t dst_thread_addr_offset, - index_t dst_wave_addr_offset) -{ - static_assert((std::is_same::value && (N == 1 || N == 2 || N == 4)) || - (std::is_same::value && (N == 2 || N == 4 || N == 8)) || - (std::is_same::value && (N == 1 || N == 2 || N == 4)), - "wrong! not implemented"); - - if constexpr(std::is_same::value) - { - if constexpr(N == 1) - { - llvm_amdgcn_raw_buffer_atomic_add_fp32(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 2) - { - llvm_amdgcn_raw_buffer_atomic_add_fp32( - src_thread_data.template get_as()[number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - - llvm_amdgcn_raw_buffer_atomic_add_fp32( - src_thread_data.template get_as()[number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + sizeof(float), - 0); - } - else if constexpr(N == 4) - { - llvm_amdgcn_raw_buffer_atomic_add_fp32( - src_thread_data.template get_as()[number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - - llvm_amdgcn_raw_buffer_atomic_add_fp32( - src_thread_data.template get_as()[number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + sizeof(float), - 0); - - llvm_amdgcn_raw_buffer_atomic_add_fp32( - src_thread_data.template get_as()[number<2>{}], - 
dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + 2 * sizeof(float), - 0); - - llvm_amdgcn_raw_buffer_atomic_add_fp32( - src_thread_data.template get_as()[number<3>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + 3 * sizeof(float), - 0); - } - } - else if constexpr(std::is_same::value) - { - if constexpr(N == 2) - { - llvm_amdgcn_raw_buffer_atomic_add_fp16x2(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 4) - { - static_for<0, 2, 1>{}([&](auto i) { - llvm_amdgcn_raw_buffer_atomic_add_fp16x2( - src_thread_data.template get_as()[i], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + i * sizeof(fp16x2_t), - 0); - }); - } - else if constexpr(N == 8) - { - static_for<0, 4, 1>{}([&](auto i) { - llvm_amdgcn_raw_buffer_atomic_add_fp16x2( - src_thread_data.template get_as()[i], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + i * sizeof(fp16x2_t), - 0); - }); - } - } - else if constexpr(std::is_same::value) - { - if constexpr(N == 1) - { - llvm_amdgcn_raw_buffer_atomic_add_i32(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 2) - { - llvm_amdgcn_raw_buffer_atomic_add_i32( - src_thread_data.template get_as()[number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - - llvm_amdgcn_raw_buffer_atomic_add_i32( - src_thread_data.template get_as()[number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + sizeof(int32_t), - 0); - } - else if constexpr(N == 4) - { - llvm_amdgcn_raw_buffer_atomic_add_i32( - src_thread_data.template get_as()[number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - - llvm_amdgcn_raw_buffer_atomic_add_i32( - src_thread_data.template 
get_as()[number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + sizeof(int32_t), - 0); - - llvm_amdgcn_raw_buffer_atomic_add_i32( - src_thread_data.template get_as()[number<2>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + 2 * sizeof(int32_t), - 0); - - llvm_amdgcn_raw_buffer_atomic_add_i32( - src_thread_data.template get_as()[number<3>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + 3 * sizeof(int32_t), - 0); - } - } -} - -template -CK_TILE_DEVICE void amd_buffer_atomic_max_impl(const thread_buffer src_thread_data, - int32x4_t dst_wave_buffer_resource, - index_t dst_thread_addr_offset, - index_t dst_wave_addr_offset) -{ - static_assert((std::is_same::value && (N == 1 || N == 2 || N == 4)), - "wrong! not implemented"); - if constexpr(std::is_same::value) - { - if constexpr(N == 1) - { - llvm_amdgcn_raw_buffer_atomic_max_fp64(bit_cast(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 2) - { - llvm_amdgcn_raw_buffer_atomic_max_fp64( - src_thread_data.template get_as()[number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - - llvm_amdgcn_raw_buffer_atomic_max_fp64( - src_thread_data.template get_as()[number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + sizeof(double), - 0); - } - else if constexpr(N == 4) - { - llvm_amdgcn_raw_buffer_atomic_max_fp64( - src_thread_data.template get_as()[number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - - llvm_amdgcn_raw_buffer_atomic_max_fp64( - src_thread_data.template get_as()[number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + sizeof(double), - 0); - - llvm_amdgcn_raw_buffer_atomic_max_fp64( - src_thread_data.template get_as()[number<2>{}], - dst_wave_buffer_resource, - 
dst_thread_addr_offset, - dst_wave_addr_offset + 2 * sizeof(double), - 0); - - llvm_amdgcn_raw_buffer_atomic_max_fp64( - src_thread_data.template get_as()[number<3>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + 3 * sizeof(double), - 0); - } - } -} - -// buffer_load requires: -// 1) p_src_wave must point to global memory space -// 2) p_src_wave must be a wavewise pointer. -// It is user's responsibility to make sure that is true. -// oob_conditional_check : dynamic check if out-of-bound -template -CK_TILE_DEVICE thread_buffer -amd_buffer_load_invalid_element_return_zero(const T* p_src_wave, - index_t src_thread_element_offset, - bool src_thread_element_valid, - index_t src_element_space_size) -{ - const int32x4_t src_wave_buffer_resource = - make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T)); - - index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); - -#if CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK - uint32_t src_addr_shift = [&]() { - if constexpr(oob_conditional_check) - return src_thread_element_valid ? 0 : 0x80000000; - else - return 0; - }(); - return amd_buffer_load_impl( - src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0); -#else - thread_buffer tmp = - amd_buffer_load_impl(src_wave_buffer_resource, src_thread_addr_offset, 0); - if constexpr(oob_conditional_check) - return src_thread_element_valid ? tmp : thread_buffer{numeric::zero()}; - else - return tmp; -#endif -} - -// buffer_load requires: -// 1) p_src_wave must point to global memory space -// 2) p_src_wave must be a wavewise pointer. -// It is user's responsibility to make sure that is true. 
-template -CK_TILE_DEVICE thread_buffer -amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave, - index_t src_thread_element_offset, - bool src_thread_element_valid, - index_t src_element_space_size, - T customized_value) -{ - const int32x4_t src_wave_buffer_resource = - make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T)); - - index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); - - thread_buffer tmp = - amd_buffer_load_impl(src_wave_buffer_resource, src_thread_addr_offset, 0); - - if constexpr(oob_conditional_check) - return src_thread_element_valid ? tmp : thread_buffer{customized_value}; - else - return tmp; -} - -template -CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer& dst, - const T* p_src_wave, - index_t src_thread_element_offset, - index_t src_linear_element_offset, - index_t src_element_space_size, - index_t is_valid_element = 0, - bool_constant = {}) -{ - const int32x4_t src_wave_buffer_resource = - make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T)); - - index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); - index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T); - - amd_buffer_load_raw_impl( - dst, - src_wave_buffer_resource, - src_thread_addr_offset, - 0, - src_linear_addr_offset, - is_valid_element, - bool_constant{}); -} - -// This version support buffer resource as input arg -template -CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer& dst, - const int32x4_t src_wave_buffer_resource, - index_t src_thread_element_offset, - index_t src_linear_element_offset, - index_t is_valid_element = 0, - bool_constant = {}) -{ - index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); - index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T); - - amd_buffer_load_raw_impl( - dst, - src_wave_buffer_resource, - src_thread_addr_offset, - 0, - src_linear_addr_offset, - is_valid_element, - bool_constant{}); -} - 
-// unfortunately async copy can not make sure invalid data is zero inside LDS -// ... unless people manually write zero to LDS at the proper address. -// so not support invalid_element check for now. -// buffer_load OOB still working. -template -CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T* smem, - const T* p_src_wave, - index_t src_thread_element_offset, - index_t src_linear_element_offset, - index_t src_element_space_size, - bool_constant = {}) -{ - const int32x4_t src_wave_buffer_resource = - make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T)); - - index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); - index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T); - - amd_async_buffer_load_impl(smem, - src_wave_buffer_resource, - src_thread_addr_offset, - 0, - src_linear_addr_offset, - bool_constant{}); -} - -// This version support buffer resource as input arg -template -CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T* smem, - const int32x4_t src_wave_buffer_resource, - index_t src_thread_element_offset, - index_t src_linear_element_offset, - bool_constant = {}) -{ - index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); - index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T); - - amd_async_buffer_load_impl(smem, - src_wave_buffer_resource, - src_thread_addr_offset, - 0, - src_linear_addr_offset, - bool_constant{}); -} - -// This version support buffer resource as input arg -template -CK_TILE_DEVICE void amd_async_buffer_load_with_oob(CK_TILE_LDS_ADDR T* smem, - const int32x4_t src_wave_buffer_resource, - index_t src_thread_element_offset, - index_t src_linear_element_offset, - bool is_valid_element, - bool_constant = {}) -{ - index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); - index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T); - - amd_async_buffer_load(smem, - src_wave_buffer_resource, - src_thread_addr_offset, - 
0, - src_linear_addr_offset, - is_valid_element, - bool_constant{}); -} - -// buffer_store requires: -// 1) p_dst_wave must point to global memory -// 2) p_dst_wave must be a wavewise pointer. -// It is user's responsibility to make sure that is true. -template -CK_TILE_DEVICE void amd_buffer_store(const thread_buffer& src_thread_data, - T* p_dst_wave, - const index_t dst_thread_element_offset, - const bool dst_thread_element_valid, - const index_t dst_element_space_size) -{ - const int32x4_t dst_wave_buffer_resource = - make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T)); - - index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); - -#if CK_TILE_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK - uint32_t dst_addr_shift = [&]() { - if constexpr(oob_conditional_check) - return dst_thread_element_valid ? 0 : 0x80000000; - else - return 0; - }(); - amd_buffer_store_impl( - src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0); -#else - if constexpr(oob_conditional_check) - { - if(dst_thread_element_valid) - { - amd_buffer_store_impl( - src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0); - } - } - else - { - amd_buffer_store_impl( - src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0); - } -#endif -} - -template -CK_TILE_DEVICE void amd_buffer_store_raw(const thread_buffer& src_thread_data, - T* p_dst_wave, - const index_t dst_thread_element_offset, - const index_t dst_linear_element_offset, - const bool dst_thread_element_valid, - const index_t dst_element_space_size) -{ - const int32x4_t dst_wave_buffer_resource = - make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T)); - - index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); - index_t dst_linear_addr_offset = dst_linear_element_offset * sizeof(T); - - amd_buffer_store_raw_impl(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - 0, - 
dst_linear_addr_offset, - dst_thread_element_valid); -} - -// buffer_atomic_add requires: -// 1) p_dst_wave must point to global memory -// 2) p_dst_wave must be a wavewise pointer. -// It is user's responsibility to make sure that is true. -template -CK_TILE_DEVICE void amd_buffer_atomic_add(const thread_buffer& src_thread_data, - T* p_dst_wave, - const index_t dst_thread_element_offset, - const bool dst_thread_element_valid, - const index_t dst_element_space_size) -{ - const int32x4_t dst_wave_buffer_resource = - make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T)); - - index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); - -#if CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK - uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000; - - amd_buffer_atomic_add_impl( - src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0); -#else - if(dst_thread_element_valid) - { - amd_buffer_atomic_add_impl( - src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0); - } -#endif -} - -template -CK_TILE_DEVICE void amd_buffer_atomic_add_raw(const thread_buffer& src_thread_data, - T* p_dst_wave, - const index_t dst_thread_element_offset, - const index_t dst_linear_element_offset, - const bool dst_thread_element_valid, - const index_t dst_element_space_size, - bool_constant = {}) -{ - const int32x4_t dst_wave_buffer_resource = - make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T)); - - index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); - index_t dst_linear_addr_offset = dst_linear_element_offset * sizeof(T); - - if constexpr(oob_conditional_check) - { - buffer_atomic_add_if{}(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - 0, - dst_linear_addr_offset, - dst_thread_element_valid); - } - else - { - buffer_atomic_add{}(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - 0, - 
dst_linear_addr_offset, - 1); - } -} - -// buffer_atomic_max requires: -// 1) p_dst_wave must point to global memory -// 2) p_dst_wave must be a wavewise pointer. -// It is user's responsibility to make sure that is true. -template -CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer& src_thread_data, - T* p_dst_wave, - const index_t dst_thread_element_offset, - const bool dst_thread_element_valid, - const index_t dst_element_space_size) -{ - const int32x4_t dst_wave_buffer_resource = - make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T)); - - index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); - -#if CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK - uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000; - - amd_buffer_atomic_max_impl( - src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0); -#else - if(dst_thread_element_valid) - { - amd_buffer_atomic_max_impl( - src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0); - } -#endif -} - -template -CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr, - const index_t global_offset, - T* lds_base_ptr, - const index_t lds_offset, - const bool is_valid, - const index_t src_element_space_size) -{ - // Direct loads require that each thread reads and writes exactly a single DWORD. - constexpr auto dword_bytes = 4; - constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread; - static_assert(bytes_per_thread == dword_bytes); - - const uint32_t* global_ptr = - reinterpret_cast(reinterpret_cast(global_base_ptr)); - const int32x4_t src_resource = - make_wave_buffer_resource(global_ptr, src_element_space_size * sizeof(T)); - const index_t global_offset_bytes = is_valid ? 
global_offset * sizeof(T) : 0x80000000; - -#if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM - T* lds_ptr = lds_base_ptr + lds_offset; - auto const lds_ptr_sgpr = - __builtin_amdgcn_readfirstlane((reinterpret_cast(lds_ptr))); - asm volatile("s_mov_b32 m0, %0; \n\t" - "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr), - "v"(global_offset_bytes), - "s"(src_resource) - : "memory"); -#else - // LDS pointer must be attributed with the LDS address space. - __attribute__((address_space(3))) uint32_t* lds_ptr = - reinterpret_cast<__attribute__((address_space(3))) uint32_t*>( - reinterpret_cast(lds_base_ptr + lds_offset)); - - llvm_amdgcn_raw_buffer_load_lds( - src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0); -#endif -} - -} // namespace ck_tile - -#endif // CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp index bdcfbdd920..c2a093f1ab 100644 --- a/include/ck_tile/core/tensor/buffer_view.hpp +++ b/include/ck_tile/core/tensor/buffer_view.hpp @@ -5,11 +5,7 @@ #include "ck_tile/core/config.hpp" #include "ck_tile/core/arch/arch.hpp" -#if __clang_major__ == 20 -#include "ck_tile/core/arch/amd_buffer_addressing_builtins.hpp" -#else #include "ck_tile/core/arch/amd_buffer_addressing.hpp" -#endif #include "ck_tile/core/arch/generic_memory_space_atomic.hpp" #include "ck_tile/core/container/array.hpp" #include "ck_tile/core/numeric/integer.hpp" From 41c17d0a953a5c399a3cf15ff283d1b57992f06d Mon Sep 17 00:00:00 2001 From: "BingYuan.Zhou" Date: Wed, 14 May 2025 09:31:26 +0800 Subject: [PATCH 119/443] fix moe sorting build fail (#2190) * fix moe sorting build fail * refile code --------- Co-authored-by: solin --- .../flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp | 3 ++- .../pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git 
a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp index 2ff9d1ebf0..cbd20a6ea3 100644 --- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp @@ -75,6 +75,7 @@ struct FlatmmPipelineAGmemBGmemCRegV1 CK_TILE_HOST_DEVICE static constexpr auto HotLoopScheduler() { +#if defined(USING_MFMA_16x16x32) && defined(ENABLE_FP8) || defined(USING_MFMA_32x32x16) constexpr auto config = BlockFlatmm::BlockPolicy::template GetWarpGemmMWarpNWarp(); using WG = remove_cvref_t())>; @@ -90,7 +91,7 @@ struct FlatmmPipelineAGmemBGmemCRegV1 constexpr index_t A_Buffer_Load_Inst_Num = kMPerBlock * kKPerBlock / BlockSize / KPerLoad; constexpr index_t A_LDS_Read_Inst_Num = MIterPerWarp * KIterPerWarp; constexpr index_t B_Buffer_Load_Inst_Num = NIterPerWarp * KIterPerWarp; - // constexpr index_t A_LDS_Read_Inst_Remain = A_LDS_Read_Inst_Num - A_Buffer_Load_Inst_Num; +#endif #if defined(USING_MFMA_16x16x32) && defined(ENABLE_FP8) static_for<0, A_Buffer_Load_Inst_Num, 1>{}([&](auto i) { ignore = i; diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp index 474924ec84..1a1b729394 100644 --- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp @@ -112,8 +112,8 @@ struct UniversalFlatmmPipelineAgBgCrPolicy make_tuple(number{}, number{}))), make_tuple(sequence<1, 0>{}, sequence<2, 3>{}), make_tuple(sequence<0>{}, sequence<1>{})); + return a_lds_block_desc; #endif - return a_lds_block_desc; } template From 7c0e29cc0f6f60ab66b48e324b2481d167722dd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Thu, 15 May 2025 16:21:34 +0200 
Subject: [PATCH 120/443] Extend 64x64 with 4 waves instances for grouped conv bwd wei (#2187) * Extend 64x64 with 4 waves instnaces for grouped conv bwd wei * Fix * fix * fix --- ...conv_bwd_weight_two_stage_xdl_instance.hpp | 29 ++++++++++++++++--- ...e_grouped_conv_bwd_weight_xdl_instance.hpp | 7 ++++- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp index 1c4dc8a445..0ed12b984b 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp @@ -72,7 +72,14 @@ using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_instances DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 32, 32, 8, 32, 32, 1, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 2>, DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 32, 32, 8, 32, 32, 2, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4>, - DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 32, 8, 32, 32, 4, 1, S<4, 16, 1>, S<2, 0, 1>, 
S<1, 0, 2>, 1, 8, 8, false, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 8> + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 32, 8, 32, 32, 4, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 8>, + + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 64, 8, 32, 32, 1, 1, S<8, 8, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 8, 8, false, S<8, 8, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 8, 8, false, 1, 1, S<1, 16, 1, 16>, 4, Scheduler, PipelineVersion, 1>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 64, 8, 32, 32, 1, 1, S<8, 16, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 4, 8, false, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 1, 8, false, 1, 1, S<1, 4, 1, 64>, 1, Scheduler, PipelineVersion, 1>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 64, 8, 32, 32, 1, 1, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 1, 8, false, S<8, 16, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 4, 8, false, 1, 1, S<1, 16, 1, 16>, 4, Scheduler, PipelineVersion, 1>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 64, 8, 32, 32, 1, 1, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 2, 8, false, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 2, 8, false, 1, 1, S<1, 8, 1, 32>, 2, Scheduler, PipelineVersion, 1>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, 
BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 64, 8, 32, 32, 1, 1, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 1, 8, false, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 1, 8, false, 1, 1, S<1, 4, 1, 64>, 1, Scheduler, PipelineVersion, 1> + // clang-format on >; @@ -138,7 +145,13 @@ using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_instance DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 32, 32, 8, 32, 32, 1, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 2, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 2>, DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 32, 32, 8, 32, 32, 2, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4>, - DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 32, 8, 32, 32, 4, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 8> + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 32, 8, 32, 32, 4, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 8>, + + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 
256, 64, 64, 64, 8, 32, 32, 1, 1, S<8, 4, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 8, 8, false, S<8, 4, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 8, 8, false, 1, 1, S<1, 16, 1, 16>, 4, Scheduler, PipelineVersion, 1>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 64, 8, 32, 32, 1, 1, S<8, 16, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 4, 8, false, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 1, 8, false, 1, 1, S<1, 4, 1, 64>, 1, Scheduler, PipelineVersion, 1>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 64, 8, 32, 32, 1, 1, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 1, 8, false, S<8, 16, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 4, 8, false, 1, 1, S<1, 16, 1, 16>, 4, Scheduler, PipelineVersion, 1>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 64, 8, 32, 32, 1, 1, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 2, 8, false, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 2, 8, false, 1, 1, S<1, 8, 1, 32>, 2, Scheduler, PipelineVersion, 1>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 64, 8, 32, 32, 1, 1, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 1, 8, false, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 1, 8, false, 1, 1, S<1, 4, 1, 64>, 1, Scheduler, PipelineVersion, 1> // clang-format on >; @@ -218,7 +231,11 @@ using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_instances DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 128, 32, 8, 32, 32, 1, 4, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, 
false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 4, 1, 8>, 1, Scheduler, PipelineVersion, 8, F16, F16, 8 ,1>, DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 32, 32, 8, 32, 32, 2, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4, F16, F16, 4, 1>, - DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 32, 8, 32, 32, 4, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 4>, 1, Scheduler, PipelineVersion, 8, F16, F16, 8, 1> + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 32, 8, 32, 32, 4, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 4>, 1, Scheduler, PipelineVersion, 8, F16, F16, 8, 1>, + + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 64, 8, 32, 32, 1, 1, S<8, 8, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 8, 8, false, S<8, 8, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 8, 8, false, 1, 1, S<1, 16, 1, 16>, 4, Scheduler, PipelineVersion, 1, F16, F16, 4, 4>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 64, 8, 32, 32, 1, 1, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 2, 8, false, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 2, 8, false, 1, 1, S<1, 8, 1, 32>, 2, Scheduler, PipelineVersion, 1, F16, F16, 2, 2>, + 
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 64, 8, 32, 32, 1, 1, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 1, 8, false, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 1, 8, false, 1, 1, S<1, 4, 1, 64>, 1, Scheduler, PipelineVersion, 1, F16, F16, 1, 1> // clang-format on >; @@ -275,7 +292,11 @@ using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_instance DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 128, 32, 8, 32, 32, 1, 4, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 4, 1, 8>, 1, Scheduler, PipelineVersion, 8, BF16, BF16, 8 ,1>, DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 32, 32, 8, 32, 32, 2, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 8, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 8, 1, 8>, 1, Scheduler, PipelineVersion, 4, BF16, BF16, 4, 1>, - DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 32, 8, 32, 32, 4, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 4>, 1, Scheduler, PipelineVersion, 8, BF16, BF16, 8, 1> + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 128, 32, 32, 8, 32, 32, 4, 1, S<4, 16, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, S<4, 4, 1>, S<2, 0, 1>, S<1, 0, 2>, 1, 8, 8, false, 1, 1, S<1, 8, 1, 4>, 1, Scheduler, PipelineVersion, 8, BF16, BF16, 8, 1>, + + 
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 64, 8, 32, 32, 1, 1, S<8, 8, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 8, 8, false, S<8, 8, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 8, 8, false, 1, 1, S<1, 16, 1, 16>, 4, Scheduler, PipelineVersion, 1, BF16, BF16, 4, 4>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 64, 8, 32, 32, 1, 1, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 2, 8, false, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 2, 8, false, 1, 1, S<1, 8, 1, 32>, 2, Scheduler, PipelineVersion, 1, BF16, BF16, 2, 2>, + DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 64, 8, 32, 32, 1, 1, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 1, 8, false, S<8, 32, 1>, S<2, 0, 1>, S<2, 0, 1>, 1, 1, 8, false, 1, 1, S<1, 4, 1, 64>, 1, Scheduler, PipelineVersion, 1, BF16, BF16, 1, 1> // clang-format on >; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp index a493719637..3587570e42 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp @@ -87,7 +87,12 @@ using device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances = std::tuple< DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 128, 32, 4, 4, 
32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, 1, 1, S<1, 32, 1, 4>, 4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>, DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>, DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 16, 1, 4>, 4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>, - DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector> + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>, + + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, 
BLayout, ELayout, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 8, 8, 32, 32, 1, 1, S<1, 8, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, 2, 4, 4, true, S<1, 8, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 16>, 4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>, + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 8, 8, 32, 32, 1, 1, S<1, 8, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, 2, 4, 4, true, S<1, 8, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, 2, 1, 4, true, 1, 1, S<1, 4, 1, 64>, 1, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>, + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 8, 8, 32, 32, 1, 1, S<1, 8, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, 2, 1, 4, true, S<1, 8, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 16>, 4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>, + DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial, ALayout, BLayout, ELayout, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvSpec, 256, 64, 64, 8, 8, 32, 32, 1, 1, S<1, 8, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, 2, 1, 4, true, S<1, 8, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, 2, 1, 4, true, 1, 1, S<1, 4, 1, 64>, 1, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector> // clang-format on >; From 3d8d6e75e485f5811df0ca37272f119392727726 Mon Sep 17 00:00:00 2001 From: Khushbu Agarwal Date: Thu, 15 May 2025 10:28:31 -0700 Subject: [PATCH 121/443] Adding validation for tile sizes in Tile Engine (#2189) * Adding validation for tile sizes * Add architecture in config, and shuffle lines of code in warp_gemm.hpp * Enable MFMA for gfx950, and 
invalid tile handling --- include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 26 ++--- .../warp/warp_gemm_attribute_mfma_impl.hpp | 8 +- .../gemm/configs/instance_combination.json | 4 +- tile_engine/ops/gemm/gemm_instance_builder.py | 96 +++++++++++++++---- 4 files changed, 96 insertions(+), 38 deletions(-) diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp index f050a8e382..be5d5690ff 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp @@ -204,14 +204,6 @@ using WarpGemmMfmaBf16Bf16F32M64N4K16 = WarpGemmImpl>>; -using WarpGemmMfma_f32_32x32x32_fp8_fp8 = WarpGemmImpl, - 2>>; - -using WarpGemmMfma_f32_32x32x32_bf8_bf8 = WarpGemmImpl, - 2>>; - using WarpGemmMfma_f32_32x32x16_fp8_bf8 = WarpGemmImpl< WarpGemmAtrributeMfma>>; @@ -221,20 +213,28 @@ using WarpGemmMfma_f32_32x32x16_bf8_fp8 = WarpGemmImpl< using WarpGemmMfma_f32_32x32x16_bf8_bf8 = WarpGemmImpl< WarpGemmAtrributeMfma>>; -using WarpGemmMfma_f32_16x16x64_fp8_fp8 = WarpGemmImpl, +using WarpGemmMfma_f32_32x32x32_fp8_fp8 = WarpGemmImpl, + 2>>; + +using WarpGemmMfma_f32_32x32x32_bf8_bf8 = WarpGemmImpl, 2>>; using WarpGemmMfma_f32_16x16x32_fp8_fp8 = WarpGemmImpl< WarpGemmAtrributeMfma>>; +using WarpGemmMfma_f32_16x16x32_bf8_bf8 = WarpGemmImpl< + WarpGemmAtrributeMfma>>; + +using WarpGemmMfma_f32_16x16x64_fp8_fp8 = WarpGemmImpl, + 2>>; + using WarpGemmMfma_f32_16x16x64_bf8_bf8 = WarpGemmImpl, 2>>; -using WarpGemmMfma_f32_16x16x32_bf8_bf8 = WarpGemmImpl< - WarpGemmAtrributeMfma>>; - using WarpGemmMfma_f32_16x16x128_fp8_fp8 = WarpGemmImpl>>; diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp index 69d22496f1..4bc4884beb 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp @@ -1092,7 +1092,7 @@ struct 
WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base } else { -#if defined(__gfx94__) +#if defined(__gfx94__) or defined(__gfx95__) if constexpr(std::is_same_v && std::is_same_v) c_vec = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8( bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); @@ -1116,7 +1116,7 @@ struct WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base // c_vec = a_vec * b_vec CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const { -#if defined(__gfx94__) +#if defined(__gfx94__) or defined(__gfx95__) if constexpr(std::is_same_v && std::is_same_v) return bit_cast(__builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8( bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); @@ -1251,7 +1251,7 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base } else { -#if defined(__gfx94__) +#if defined(__gfx94__) or defined(__gfx95__) if constexpr(std::is_same_v && std::is_same_v) c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8( bit_cast(a_vec), bit_cast(b_vec), c_vec, 0, 0, 0); @@ -1286,7 +1286,7 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base // c_vec = a_vec * b_vec CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const { -#if defined(__gfx94__) +#if defined(__gfx94__) or defined(__gfx95__) if constexpr(std::is_same_v && std::is_same_v) return bit_cast(__builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8( bit_cast(a_vec), bit_cast(b_vec), CVecType{0.f}, 0, 0, 0)); diff --git a/tile_engine/ops/gemm/configs/instance_combination.json b/tile_engine/ops/gemm/configs/instance_combination.json index 53197ada6c..b497513efa 100644 --- a/tile_engine/ops/gemm/configs/instance_combination.json +++ b/tile_engine/ops/gemm/configs/instance_combination.json @@ -1,5 +1,7 @@ { - + "architecture": { + "values": ["gfx90a"] + }, "layout_a": { "values": ["r"] }, diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index 3839523e3d..dd8b4d1157 100755 --- 
a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -23,7 +23,39 @@ DATA_TYPE_MAP = {'fp32' : 'float', } LAYOUT_MAP = {'r' : 'ck_tile::tensor_layout::gemm::RowMajor', - 'c' : 'ck_tile::tensor_layout::gemm::ColumnMajor'} + 'c' : 'ck_tile::tensor_layout::gemm::ColumnMajor'} + + +warp_tile_combinations_map = { + "gfx90a": { + 'fp16': [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], + 'bf16': [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], + 'fp8': [[32, 32, 16], [32, 32, 32]], + 'bf8': [[32, 32, 16], [32, 32, 32]] + }, + "gfx942": { + 'fp16': [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], + 'bf16': [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], + 'fp8': [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]], + 'bf8': [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32]] + }, + "gfx950": { + 'fp16': [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], + 'bf16': [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], + 'fp8': [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]], + 'bf8': [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32], [16, 16, 128], [32, 32, 64]] + } + } + +def sizeOf(data_type): + if data_type == 'fp16' or data_type == 'bf16': + return 2 + elif data_type == 'int8' or data_type == 'fp8' or data_type == 'bf8': + return 1 + elif data_type == 'int4': ## TODO:: needs to confirm + return 0.5 + else: + return 4 DEFAULT_EPILOGUE = """ using GemmEpilogue = ck_tile::DefaultGemm2DEpilogue< @@ -168,11 +200,15 @@ class GemmConfig: self.matrix_cfg : Dict[str, Any] = {} self.impl_cfg : Dict[str, Any] = {} for key, value in config_data.items(): - if key in ["datatype", "layout_a", "layout_b", "layout_c"]: + if key in ["architecture", "datatype", "layout_a", "layout_b", 
"layout_c"]: self.matrix_cfg[key] = value else: self.impl_cfg[key] = value + @property + def architecture(self) -> str: + return self.matrix_cfg["architecture"]["values"][0] + @property def datatype(self) -> str: return self.matrix_cfg["datatype"]["values"][0] @@ -201,7 +237,7 @@ class GemmCodeGenerator: def _validate_config(self): """Validate matrix and implementation configurations""" # Matrix config validation - for param in ["datatype", "layout_a", "layout_b", "layout_c"]: + for param in ["architecture", "datatype", "layout_a", "layout_b", "layout_c"]: if len(self.config.matrix_cfg[param]["values"]) != 1: raise ValueError(f"Matrix config {param} must have exactly one value") @@ -327,7 +363,7 @@ namespace {group_name} {{ return f""" template void try_run(ck_tile::TailNumber tn) {{ - if constexpr (Pipeline::PrefetchStages > static_cast(TN)) {{ + if constexpr (Pipeline::PrefetchStages > static_cast(TN) - 1) {{ if (tn == TN) {{ RunSplitk(ck_tile::bool_constant{{}}, ck_tile::integral_constant{{}}); @@ -477,6 +513,30 @@ struct GemmKernel {{ content += f"#include \"gemm_{group}.hpp\"\n" (self.output_dir / "gemm_instances.hpp").write_text(content) + def is_tile_valid(self, tile: tuple, group: str) -> bool: + """Check if the tile configuration is valid for the given group""" + # Extract tile parameters + tile_m, tile_n, tile_k, warp_m, warp_n, warp_k, warp_tile_m, warp_tile_n, warp_tile_k = tile + + # Extract the pipeline and epilogue from the group name + _, pipeline, epilogue, scheduler, *_ = group.split("_") + + if tile_m % (warp_m * warp_tile_m) == 0 and \ + tile_n % (warp_n * warp_tile_n) == 0 and \ + tile_k % (warp_k * warp_tile_k) == 0: + total_tile_in_lds = (tile_m * tile_k + tile_n * tile_k ) * sizeOf(self.config.datatype) + # Validate and append valid tile parameters + is_compv4 = pipeline == "compv4" + max_tile_size = pow(2, 16) if is_compv4 else pow(2, 15) + + if total_tile_in_lds > max_tile_size: + raise ValueError(f'Total tile size should not exceed 
{max_tile_size / 1024}KB of LDS. ' + f'{tile_m} * {tile_n} * {tile_k} > {max_tile_size / 1024}KB') + arch = self.config.architecture + if [warp_tile_m, warp_tile_n, warp_tile_k] in warp_tile_combinations_map[arch][self.config.datatype]: + return True + return False + def _generate_dispatcher(self): """Generate dispatch mechanism""" content = """// SPDX-License-Identifier: MIT @@ -517,7 +577,7 @@ struct GemmDispatcher { self.config.impl_cfg["warp_tile_k"]["values"] )) - + for group in self.all_kernels: content += f""" kernel_map["{group}"] = [=](ck_tile::DeviceMem& c_m_n_dev_buf, ck_tile::HostTensor& c_m_n_host_result, @@ -526,26 +586,22 @@ struct GemmDispatcher { const ck_tile::stream_config& stream) {{ if(structured_sparsity){{ // SMFMA""" for tile in tile_params: - # Check if we have valid tile/warp combinations - # (tile_m/(warp_m*warp_tile_m)) * warp_m * warp_tile_m == tile_m - if ((tile[0]/(tile[3] * tile[7]) * tile[3] * tile[7]) != tile[0]) or \ - ((tile[1]/(tile[4] * tile[8]) * tile[4] * tile[8]) != tile[1]): - continue - sparse = self.atype == 'fp16' and \ - ((tile[6] == 32 and tile[7] == 32 and tile[8] == 16) or - (tile[6] == 16 and tile[7] == 16 and tile[8] == 32)) - content += f""" + if self.is_tile_valid(tile, group): + sparse = self.atype == 'fp16' and \ + ((tile[6] == 32 and tile[7] == 32 and tile[8] == 16) or + (tile[6] == 16 and tile[7] == 16 and tile[8] == 32)) + content += f""" run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(sparse)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream);""" + else: + raise ValueError(f"Invalid tile configuration for group {group}: {tile}") content += f""" }} else {{""" for tile in tile_params: - # Check if we have valid tile/warp combinations - # (tile_m/(warp_m*warp_tile_m)) * warp_m * warp_tile_m == tile_m - if ((tile[0]/(tile[3] * tile[7]) * tile[3] * tile[7]) != tile[0]) or \ - ((tile[1]/(tile[4] * 
tile[8]) * tile[4] * tile[8]) != tile[1]): - continue - content += f""" + if self.is_tile_valid(tile, group): + content += f""" run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(False)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream);""" + else: + raise ValueError(f"Invalid tile configuration for group {group}: {tile}") content += f""" }} }};\n""" From 8cb0474b3d880abe55bca977856a4be104aac337 Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Fri, 16 May 2025 02:47:29 +0800 Subject: [PATCH 122/443] Use only qr_async pipeline for batch_prefill (#2195) --- .../ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py b/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py index 30b9299963..76b9429b2e 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py @@ -470,11 +470,10 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) else: if bias == "bias": - # TODO: rocm 6.2 compiler problem if using qr_async for bias case - pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask)) - pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) - pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask)) - pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr_async', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask)) + 
pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr_async', 'col', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask)) + pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) else: pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask)) pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask)) From 791802b381c99e47966cbf4a987b91ab3d56bcfc Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Fri, 16 May 2025 15:14:46 +0800 Subject: [PATCH 123/443] [CK_TILE] fMHA batch_prefill block index & logits soft-capping optimizations (#2198) * Write soft-sign in inline asm * Change tile idx computation * Add macro to turn off soft-sign asm opt * Use simple for loop to avoid register spill * Only do block id transform for masking cases --- include/ck_tile/ops/fmha/block/variants.hpp | 38 ++++++++++++++++--- .../fmha/kernel/fmha_batch_prefill_kernel.hpp | 21 ++++++++-- ..._batch_prefill_pipeline_qr_ks_vs_async.hpp | 13 ++++++- 3 files changed, 63 insertions(+), 9 deletions(-) diff --git a/include/ck_tile/ops/fmha/block/variants.hpp b/include/ck_tile/ops/fmha/block/variants.hpp index 90fc5656fc..d8b0cdbb86 100644 --- a/include/ck_tile/ops/fmha/block/variants.hpp +++ b/include/ck_tile/ops/fmha/block/variants.hpp @@ -15,7 +15,36 @@ #define CK_TILE_ATTENTION_LOGITS_SOFT_CAP_DEFAULT CK_TILE_ATTENTION_LOGITS_SOFT_CAP_TANH #endif +#ifndef CK_TILE_ATTENTION_USE_SOFTSIGN_ASM +#define CK_TILE_ATTENTION_USE_SOFTSIGN_ASM 0 +#endif + namespace ck_tile { +namespace internal { +__device__ inline float +exp2_soft_sign_impl(float softmax_scale, float logits, float logits_soft_cap_rcp) +{ +#if(defined(__gfx90a__) || defined(__gfx94__)) && \ + (CK_TILE_ATTENTION_LOGITS_SOFT_CAP_DEFAULT == 
CK_TILE_ATTENTION_LOGITS_SOFT_CAP_SOFTSIGN && \ + CK_TILE_ATTENTION_USE_SOFTSIGN_ASM) + /// NOTICE: Make sure softmax_scale is stored in SGPR + float result, numerator, denominator; + asm volatile( + "v_mul_f32_e32 %[denominator], %[logits], %[logits_soft_cap_rcp]\n" + "v_add_f32_e64 %[denominator], |%[denominator]|, 1.0\n" + "v_rcp_f32_e32 %[denominator], %[denominator]\n" + "v_mul_f32_e32 %[numerator], %[softmax_scale], %[logits]\n" + "v_mul_f32_e32 %[result], %[numerator], %[denominator]" + : [numerator] "=&v"(numerator), [denominator] "=&v"(denominator), [result] "=v"(result) + : [softmax_scale] "s"(softmax_scale), + [logits] "v"(logits), + [logits_soft_cap_rcp] "v"(logits_soft_cap_rcp)); + return result; +#else + return softmax_scale * logits * rcp(1.f + abs(logits * logits_soft_cap_rcp)); +#endif +} +} // namespace internal template struct StandardAttentionParams @@ -169,8 +198,8 @@ struct LogitsSoftCap return params.logits_soft_cap * tanh_fast(type_convert(logits) * params.logits_soft_cap_rcp); #elif CK_TILE_ATTENTION_LOGITS_SOFT_CAP_DEFAULT == CK_TILE_ATTENTION_LOGITS_SOFT_CAP_SOFTSIGN - return params.sm_scale * type_convert(logits) * - rcp(1.f + abs(type_convert(logits) * params.logits_soft_cap_rcp)); + return internal::exp2_soft_sign_impl( + params.sm_scale, type_convert(logits), params.logits_soft_cap_rcp); #endif } else @@ -239,9 +268,8 @@ struct ComposedAttention return params.logits_soft_cap * tanh_fast(type_convert(logits) * params.logits_soft_cap_rcp); #elif CK_TILE_ATTENTION_LOGITS_SOFT_CAP_DEFAULT == CK_TILE_ATTENTION_LOGITS_SOFT_CAP_SOFTSIGN - return params.sm_scale * type_convert(logits) * - rcp(1.f + - abs(type_convert(logits) * params.logits_soft_cap_rcp)); + return internal::exp2_soft_sign_impl( + params.sm_scale, type_convert(logits), params.logits_soft_cap_rcp); #endif } else diff --git a/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp index ba327ee511..7472c82114 
100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp @@ -651,8 +651,15 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel }; const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1); - - return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch); + if constexpr(kHasMask) + { + // assume that num_tile_n1 is always 1 + return ck_tile::make_tuple(gridDim.z - 1 - i_tile_m, i_tile_n, i_nhead, i_batch); + } + else + { + return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch); + } } else { @@ -672,7 +679,15 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel const auto [i_tile_m, i_tile_n] = f(i_block, num_tile_n1); - return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch); + if constexpr(kHasMask) + { + // assume that num_tile_n1 is always 1 + return ck_tile::make_tuple(gridDim.x - 1 - i_tile_m, i_tile_n, i_nhead, i_batch); + } + else + { + return ck_tile::make_tuple(i_tile_m, i_tile_n, i_nhead, i_batch); + } } } diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp index e07cf1c94e..8691622bb0 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp @@ -6,8 +6,9 @@ #include "ck_tile/core.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/fmha/block/block_attention_bias_enum.hpp" -#include "ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async_default_policy.hpp" #include "ck_tile/ops/fmha/block/block_dropout.hpp" +#include "ck_tile/ops/fmha/block/variants.hpp" +#include "ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async_default_policy.hpp" #include "ck_tile/ops/reduce/block/block_reduce.hpp" namespace ck_tile { @@ -498,6 
+499,16 @@ struct BlockFmhaBatchPrefillPipelineQRKSVSAsync #else for(index_t i = 0; i < s_acc.thread_buf_.size(); ++i) { +#if(defined(__gfx90a__) || defined(__gfx94__)) && \ + (CK_TILE_ATTENTION_LOGITS_SOFT_CAP_DEFAULT == CK_TILE_ATTENTION_LOGITS_SOFT_CAP_SOFTSIGN && \ + CK_TILE_ATTENTION_USE_SOFTSIGN_ASM) + // Avoid data hazard if v_mfma is followed by inline asm consumer + // instructions. In this case, compiler won't add s_nop for us + if(i == s_acc.thread_buf_.size() / 2) + { + __builtin_amdgcn_sched_barrier(0); + } +#endif apply_logits_transform(s_acc.thread_buf_[i]); } #endif From fa3c6811d8e81096f52779bf0877777bf405d241 Mon Sep 17 00:00:00 2001 From: Mateusz Ozga <110818320+mozga-amd@users.noreply.github.com> Date: Fri, 16 May 2025 10:18:47 +0200 Subject: [PATCH 124/443] Disable conv for Filter1x1Stride1Pad0 when K or C is even (#2186) --- include/ck/ck.hpp | 3 +++ .../device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp | 7 +++++++ .../test_grouped_convnd_bwd_weight.cpp | 1 + 3 files changed, 11 insertions(+) diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index e38f166c1a..26e4787949 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -222,6 +222,9 @@ // TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread" #define CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0 +// workaround: conv crash when K, C is even +#define CK_WORKAROUND_DISABLE_FILTER1x1STRIDE1PAD0_WHEN_K_C_IS_EVEN 1 + // workaround: compiler crash when compiling recursive lambda #define CK_WORKAROUND_SWDEV_275126 1 diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp index dd5b97096d..869457a99e 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp +++ 
b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp @@ -1206,6 +1206,13 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3 if constexpr(ConvBackwardWeightSpecialization == ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0) { +// workaround: disable when K, C is even +#if CK_WORKAROUND_DISABLE_FILTER1x1STRIDE1PAD0_WHEN_K_C_IS_EVEN + if(arg.Conv_C_ % 2 == 0 || arg.Conv_K_ % 2 == 0) + { + return false; + } +#endif // check if it's 1x1, stride=1 pad = 0 conv for(int i = 0; i < NDimSpatial; i++) { diff --git a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp index 21f2cb5ce6..95a0a09414 100644 --- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp +++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp @@ -188,6 +188,7 @@ TYPED_TEST(TestGroupedConvndBwdWeight1d, Test1D) TYPED_TEST(TestGroupedConvndBwdWeight2d, Test2D) { this->conv_params.clear(); + this->conv_params.push_back({2, 2, 64, 4, 4, {1, 1}, {7, 7}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); this->conv_params.push_back( {2, 2, 64, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); this->conv_params.push_back({2, 2, 64, 3, 3, {1, 1}, {7, 7}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); From 40668c9a993ca9391eb628bbb4be3ca3fb4e7e56 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 16 May 2025 07:40:53 -0700 Subject: [PATCH 125/443] Build and store CK library deb package for all targets daily. 
(#2196) * generate and store library package for all targets * use ninja to build packages for all targets * make sure to use ftime-trace when using ninja * make sure build trace only runs on gfx9 * archive lib package and stash only library package --- Jenkinsfile | 135 +++++++++--------- .../gpu/CMakeLists.txt | 2 +- 2 files changed, 67 insertions(+), 70 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 68e0fa1246..c26350f120 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -93,6 +93,30 @@ def build_compiler(){ return compiler } +def check_arch(){ + def arch_type = 0 + sh 'rocminfo | tee rocminfo.log' + if ( runShell('grep -n "gfx90a" rocminfo.log') ){ + arch_type = 1 + } + else if ( runShell('grep -n "gfx942" rocminfo.log') ) { + arch_type = 2 + } + else if ( runShell('grep -n "gfx10" rocminfo.log') ) { + arch_type = 3 + } + else if ( runShell('grep -n "gfx11" rocminfo.log') ) { + arch_type = 4 + } + else if ( runShell('grep -n "gfx12" rocminfo.log') ) { + arch_type = 5 + } + else if ( runShell('grep -n "gfx908" rocminfo.log') ) { + arch_type = 6 + } + return arch_type +} + def getDockerImage(Map conf=[:]){ env.DOCKER_BUILDKIT=1 def prefixpath = conf.get("prefixpath", "/opt/rocm") @@ -287,7 +311,7 @@ def cmake_build(Map conf=[:]){ def build_cmd def execute_cmd = conf.get("execute_cmd", "") if(!setup_args.contains("NO_CK_BUILD")){ - if (setup_args.contains("gfx90a") && params.NINJA_BUILD_TRACE){ + if (setup_args.contains("gfx9") && params.NINJA_BUILD_TRACE){ echo "running ninja build trace" setup_cmd = conf.get("setup_cmd", """${cmake_envs} cmake -G Ninja ${setup_args} -DCMAKE_CXX_FLAGS=" -O3 -ftime-trace " .. 
""") build_cmd = conf.get("build_cmd", "${build_envs} ninja -j${nt} ${config_targets}") @@ -315,7 +339,7 @@ def cmake_build(Map conf=[:]){ sh cmd //run tests except when NO_CK_BUILD or BUILD_LEGACY_OS are set if(!setup_args.contains("NO_CK_BUILD") && !params.BUILD_LEGACY_OS){ - if (setup_args.contains("gfx90a") && params.NINJA_BUILD_TRACE){ + if ((setup_args.contains("gfx9") && params.NINJA_BUILD_TRACE) || params.BUILD_INSTANCES_ONLY){ sh "/ninjatracing/ninjatracing .ninja_log > ck_build_trace.json" sh "/ClangBuildAnalyzer/build/ClangBuildAnalyzer --all . clang_build.log" sh "/ClangBuildAnalyzer/build/ClangBuildAnalyzer --analyze clang_build.log > clang_build_analysis.log" @@ -323,7 +347,15 @@ def cmake_build(Map conf=[:]){ archiveArtifacts "clang_build_analysis.log" // do not run unit tests when building instances only if(!params.BUILD_INSTANCES_ONLY){ - sh "ninja test" + sh "ninja check" + } + if(params.BUILD_INSTANCES_ONLY){ + // build deb packages + echo "Build packages" + sh 'ninja -j64 package' + archiveArtifacts artifacts: 'composablekernel-dev*.deb' + sh 'mv composablekernel-dev_*.deb composablekernel-dev_all_targets_1.1.0_amd64.deb' + stash includes: "composablekernel-dev_all_targets_1.1.0_amd64.deb", name: "packages" } } else{ @@ -340,21 +372,14 @@ def cmake_build(Map conf=[:]){ archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true } //check the node gpu architecture - def arch_type = 0 - sh 'rocminfo | tee rocminfo.log' - if ( runShell('grep -n "gfx90a" rocminfo.log') ){ - arch_type = 1 - } - else if ( runShell('grep -n "gfx942" rocminfo.log') ) { - arch_type = 2 - } + def arch = check_arch() if (params.RUN_CK_TILE_FMHA_TESTS){ try{ archiveArtifacts "perf_fmha_*.log" - if (arch_type == 1){ + if (arch == 1){ stash includes: "perf_fmha_**_gfx90a.log", name: "perf_fmha_log_gfx90a" } - else if (arch_type == 2){ + else if (arch == 2){ stash includes: "perf_fmha_**_gfx942.log", name: "perf_fmha_log_gfx942" } } @@ -379,10 +404,10 
@@ def cmake_build(Map conf=[:]){ if (params.RUN_CK_TILE_GEMM_TESTS){ try{ archiveArtifacts "perf_tile_gemm_**.log" - if (arch_type == 1){ + if (arch == 1){ stash includes: "perf_tile_gemm_**_gfx90a.log", name: "perf_tile_gemm_log_gfx90a" } - else if (arch_type == 2){ + else if (arch == 2){ stash includes: "perf_tile_gemm_**_gfx942.log", name: "perf_tile_gemm_log_gfx942" } } @@ -410,7 +435,13 @@ def buildHipClangJob(Map conf=[:]){ def prefixpath = conf.get("prefixpath", "/opt/rocm") // Jenkins is complaining about the render group - def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + def dockerOpts + if ( params.BUILD_INSTANCES_ONLY ){ + dockerOpts = "--group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + } + else{ + dockerOpts = "--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + } if (conf.get("enforce_xnack_on", false)) { dockerOpts = dockerOpts + " --env HSA_XNACK=1 " } @@ -521,28 +552,9 @@ def Build_CK(Map conf=[:]){ timeout(time: 20, unit: 'HOURS') { //check whether to run performance tests on this node - def arch_type = 0 - sh 'rocminfo | tee rocminfo.log' - if ( runShell('grep -n "gfx90a" rocminfo.log') ){ - arch_type = 1 - } - else if ( runShell('grep -n "gfx942" rocminfo.log') ) { - arch_type = 2 - } - else if ( runShell('grep -n "gfx10" rocminfo.log') ) { - arch_type = 3 - } - else if ( runShell('grep -n "gfx11" rocminfo.log') ) { - arch_type = 4 - } - else if ( runShell('grep -n "gfx12" rocminfo.log') ) { - arch_type = 5 - } - else if ( runShell('grep -n "gfx908" rocminfo.log') ) { - arch_type = 6 - } + def arch = check_arch() cmake_build(conf) - if ( params.RUN_INDUCTOR_TESTS && !params.BUILD_LEGACY_OS && arch_type == 1 ){ + if ( params.RUN_INDUCTOR_TESTS && !params.BUILD_LEGACY_OS && arch == 1 ){ echo "Run inductor codegen tests" sh """ 
python3 -m venv ${env.WORKSPACE} @@ -553,9 +565,9 @@ def Build_CK(Map conf=[:]){ """ } dir("build"){ - if (params.RUN_FULL_QA && arch_type == 2 ){ - // build deb packages for all gfx9 targets on gfx90a system and prepare to export - echo "Build ckProfiler package" + if (params.RUN_FULL_QA && arch == 2 ){ + // build deb packages + echo "Build packages" sh 'make -j package' archiveArtifacts artifacts: 'composablekernel*.deb' sh 'mv composablekernel-ckprofiler_*.deb composablekernel-ckprofiler_1.1.0_amd64.deb' @@ -568,7 +580,7 @@ def Build_CK(Map conf=[:]){ // run performance tests, stash the logs, results will be processed on the master node dir("script"){ if (params.RUN_PERFORMANCE_TESTS){ - if (params.RUN_FULL_QA && arch_type == 1){ + if (params.RUN_FULL_QA && arch == 1){ // run full tests on gfx90a echo "Run full performance tests" sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" @@ -587,7 +599,7 @@ def Build_CK(Map conf=[:]){ archiveArtifacts "perf_mixed_gemm.log" stash includes: "perf_**.log", name: "perf_log" } - else if ( arch_type == 1 ){ + else if ( arch == 1 ){ // run standard tests on gfx90a echo "Run performance tests" sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" @@ -598,28 +610,28 @@ def Build_CK(Map conf=[:]){ stash includes: "perf_**.log", name: "perf_log" } // disable performance tests on gfx1030 for now. 
- //else if ( arch_type == 3){ + //else if ( arch == 3){ // run basic tests on gfx1030 // echo "Run gemm performance tests" // sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx10" // archiveArtifacts "perf_onnx_gemm_gfx10.log" // stash includes: "perf_onnx_gemm_gfx10.log", name: "perf_log_gfx10" //} - else if ( arch_type == 4){ + else if ( arch == 4){ // run basic tests on gfx11 echo "Run gemm performance tests" sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx11" archiveArtifacts "perf_onnx_gemm_gfx11.log" stash includes: "perf_onnx_gemm_gfx11.log", name: "perf_log_gfx11" } - else if ( arch_type == 5 ){ + else if ( arch == 5 ){ // run basic tests on gfx12 echo "Run gemm performance tests" sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx12" archiveArtifacts "perf_onnx_gemm_gfx12.log" stash includes: "perf_onnx_gemm_gfx12.log", name: "perf_log_gfx12" } - else if ( arch_type == 6 ){ + else if ( arch == 6 ){ // run basic tests on gfx908 echo "Run performance tests" sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx908" @@ -628,7 +640,7 @@ def Build_CK(Map conf=[:]){ } } } - if (params.hipTensor_test && arch_type == 1 ){ + if (params.hipTensor_test && arch == 1 ){ // build and test hipTensor on gfx90a node sh """#!/bin/bash rm -rf "${params.hipTensor_branch}".zip @@ -730,24 +742,10 @@ def process_results(Map conf=[:]){ echo "could not locate the GEMM performance logs: ${err.getMessage()}." 
} } - if (params.RUN_FULL_QA){ - // unstash perf files to master + if (params.RUN_FULL_QA || params.BUILD_INSTANCES_ONLY){ + // unstash deb packages unstash "packages" sh "sshpass -p ${env.ck_deb_pw} scp -o StrictHostKeyChecking=no composablekernel-*.deb ${env.ck_deb_user}@${env.ck_deb_ip}:/var/www/html/composable_kernel/" - try{ - unstash "perf_log" - } - catch(Exception err){ - echo "could not locate perf_log: ${err.getMessage()}." - } - try{ - unstash "perf_log_gfx11" - unstash "perf_log_gfx12" - } - catch(Exception err){ - echo "could not locate the GEMM gfx11/gfx12 performance logs: ${err.getMessage()}." - } - sh "./process_qa_data.sh" } else{ // unstash perf files to master @@ -775,12 +773,12 @@ def process_results(Map conf=[:]){ } } -//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version -CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;ROCMVERSION=6.4;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_TRANSPOSE_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true - 0 21 * * * % ROCMVERSION=6.4;hipTensor_test=true;RUN_CODEGEN_TESTS=true;BUILD_GFX908=true +//launch develop branch daily jobs +CRON_SETTINGS = BRANCH_NAME == "develop" ? 
'''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_TRANSPOSE_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true + 0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;RUN_CODEGEN_TESTS=true;BUILD_GFX908=true 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true - 0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false + 0 15 * * * % BUILD_INSTANCES_ONLY=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 13 * * * % BUILD_LEGACY_OS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false''' : "" pipeline { @@ -1263,8 +1261,7 @@ pipeline { execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_CXX_COMPILER="${build_compiler()}" \ -D CMAKE_BUILD_TYPE=Release \ - -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1151;gfx1201" \ - -D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j64 """ + -D CMAKE_CXX_FLAGS=" -O3 -ftime-trace" .. 
&& ninja -j64 """ } steps{ buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args) diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index 25ea3b2ae4..97946207a1 100755 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -103,7 +103,7 @@ function(add_instance_library INSTANCE_NAME) list(REMOVE_ITEM ARGN "${source}") endif() endforeach() - message("remaining instances: ${ARGN}") + #message("remaining instances: ${ARGN}") #only continue if there are some source files left on the list if(ARGN) set(INST_OBJ) From 5b3430b868766068dabcc92394f0da65d9206099 Mon Sep 17 00:00:00 2001 From: arai713 <67439843+arai713@users.noreply.github.com> Date: Fri, 16 May 2025 11:11:54 -0700 Subject: [PATCH 126/443] Narrowing error fix for codegen compilation (#2194) * removed comment with special characters * fix for arg/template change after merge from develop --------- Co-authored-by: Thomas Ning --- ...e_gemm_pipeline_xdlops_b_preshuffle_v3.hpp | 1 - .../device_gemm_multiple_d_xdl_cshuffle.hpp | 54 ++++++++++--------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp index 6f3a7e6357..6f0404a1ca 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp @@ -381,7 +381,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3{}([&](auto m0) { static_for<0, KRepeat, 1>{}([&](auto k0) { static_for<0, KGroup, 1>{}([&](auto kg0) { - // K = k0 × KGroup × k1 = k0 × kg0 × A_K1 a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, make_tuple(m0, I0, I0, 
Number{}, I0, I0), a_block_buf.At(I0), diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp index 6c4195e75d..f193b093d1 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp @@ -860,35 +860,37 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD(p_a_grid, - p_b_grid, - p_ds_grid, - p_e_grid, - p_shared_block, - desc.a_element_op, - desc.b_element_op, - desc.cde_element_op, - desc.a_grid_desc_ak0_m_ak1, - desc.b_grid_desc_bk0_n_bk1, - desc.ds_grid_desc_mblock_mperblock_nblock_nperblock, - desc.e_grid_desc_mblock_mperblock_nblock_nperblock, - desc.block_2_etile_map); + GridwiseGemm::template Run( + p_a_grid, + p_b_grid, + p_ds_grid, + p_e_grid, + p_shared_block, + desc.a_element_op, + desc.b_element_op, + desc.cde_element_op, + desc.a_grid_desc_ak0_m_ak1, + desc.b_grid_desc_bk0_n_bk1, + desc.ds_grid_desc_mblock_mperblock_nblock_nperblock, + desc.e_grid_desc_mblock_mperblock_nblock_nperblock, + desc.block_2_etile_map); } else { - GridwiseGemm::template Run(p_a_grid, - p_b_grid, - p_ds_grid, - p_e_grid, - p_shared_block, - desc.a_element_op, - desc.b_element_op, - desc.cde_element_op, - desc.a_grid_desc_ak0_m_ak1, - desc.b_grid_desc_bk0_n_bk1, - desc.ds_grid_desc_mblock_mperblock_nblock_nperblock, - desc.e_grid_desc_mblock_mperblock_nblock_nperblock, - desc.block_2_etile_map); + GridwiseGemm::template Run( + p_a_grid, + p_b_grid, + p_ds_grid, + p_e_grid, + p_shared_block, + desc.a_element_op, + desc.b_element_op, + desc.cde_element_op, + desc.a_grid_desc_ak0_m_ak1, + desc.b_grid_desc_bk0_n_bk1, + desc.ds_grid_desc_mblock_mperblock_nblock_nperblock, + desc.e_grid_desc_mblock_mperblock_nblock_nperblock, + desc.block_2_etile_map); } } }; From 6342f6b5e8bbb9f2b4cefa33d2a863a8bb35329b Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Sat, 17 May 2025 03:42:02 +0200 Subject: [PATCH 127/443] Restore oddc instances (#2201) --- .../gpu/grouped_convolution_forward.hpp | 8 ++ .../gpu/grouped_convolution_forward_wmma.inc | 111 ++++++++++++++++++ .../gpu/grouped_conv2d_fwd/CMakeLists.txt | 4 + ...ma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp | 40 +++++++ ...mma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp | 40 +++++++ ...ma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp | 40 +++++++ ...mma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp | 40 +++++++ ...hwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp | 9 ++ ...l_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp | 8 ++ ...c_gkyxc_nhwgk_bf16_comp_part2_instance.cpp | 9 ++ ...nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp | 9 ++ ...dl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp | 8 ++ ...gc_gkyxc_nhwgk_f16_comp_part2_instance.cpp | 9 ++ ...dl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp | 10 +- ...l_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp | 28 ++++- ...wd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp | 10 +- ...fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 10 +- ...fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 10 +- ...wd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 10 +- ...fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 10 +- ...fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp | 10 +- ...wd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp | 10 +- ...gc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp | 11 +- ...gc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp | 11 +- ...wgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp | 11 +- ...wgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp | 11 +- ...wgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp | 11 +- ...wgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp | 11 +- ...gc_gkyxc_nhwgk_int8_mem_inter_instance.cpp | 11 +- ...gc_gkyxc_nhwgk_int8_mem_intra_instance.cpp | 11 +- .../gpu/grouped_conv3d_fwd/CMakeLists.txt | 4 + ...gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp | 41 +++++++ ..._gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp | 41 +++++++ 
...ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp | 41 +++++++ ..._ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp | 41 +++++++ 35 files changed, 682 insertions(+), 17 deletions(-) create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp index cf5dbaa323..545826650c 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp @@ -613,6 +613,7 @@ struct DeviceOperationInstanceFactory>>& instances); +void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instances( + 
std::vector>>& instances); + void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instances( std::vector>>& instances); +void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instances( + std::vector>>& instances); + void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instances( std::vector>>& instances); +void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instances( + std::vector>>& instances); + void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instances( std::vector>>& instances); +void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instances( + std::vector>>& instances); #endif #ifdef CK_ENABLE_FP16 @@ -236,6 +291,20 @@ void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instances( PassThrough, PassThrough>>>& instances); +void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instances( + std::vector>>& instances); + void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instances( std::vector>>& instances); +void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instances( + std::vector>>& instances); + void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instances( std::vector>>& instances); +void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instances( + std::vector>>& instances); + void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances( std::vector>>& instances); + +void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instances( + std::vector>>& instances); #endif } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt index eba6fd789e..22e9d726b0 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt @@ -93,6 +93,8 
@@ add_instance_library(device_grouped_conv2d_fwd_instance wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1p0_instance.cpp wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp ## NHWGC, GKYXC, NHWGK wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp @@ -100,4 +102,6 @@ add_instance_library(device_grouped_conv2d_fwd_instance wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instance.cpp wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp new file mode 100644 index 0000000000..a8f723dfec --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k] +void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_wmma_f16_instances<2, + GNHWC, + GKYXC, + Empty_Tuple, + GNHWK, + Empty_Tuple, + PassThrough, + ConvFwdOddC>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp new file mode 100644 index 0000000000..784a118897 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k] +void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_wmma_i8_instances<2, + GNHWC, + GKYXC, + Empty_Tuple, + GNHWK, + Empty_Tuple, + PassThrough, + ConvFwdOddC>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp new file mode 100644 index 0000000000..8c621543a9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_wmma_f16_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + Empty_Tuple, + PassThrough, + ConvFwdOddC>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp new file mode 100644 index 0000000000..5cb313b3ca --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +void add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_wmma_i8_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + Empty_Tuple, + PassThrough, + ConvFwdOddC>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp index f5df7278d0..c078f8ed04 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.cpp @@ -52,6 +52,15 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); } } diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp index db048679bd..a67b11f1cf 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.cpp @@ -49,6 +49,14 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp index ee9507a80a..5c0391a25f 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp @@ -52,6 +52,15 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instanc Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); } } diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp index 132d3c8411..726276c461 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.cpp @@ -52,6 +52,15 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); } } diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp index a7deb969ba..8b7bdec2a8 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.cpp @@ -49,6 +49,14 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_comp_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp index d2732547fa..c66114b9a3 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp @@ -52,6 +52,15 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); } } diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp index 8a0caebc9f..93e07e08fb 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" @@ -48,6 +48,14 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_comp_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp index e45df1e107..6acbb7475c 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" @@ -50,6 +50,14 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instances( NHWGK, ConvFwd1x1S1P0>{}); + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_comp_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); + if(ck::get_device_name() != "gfx950") { add_device_operation_instances( @@ -78,6 +86,15 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_int8_comp_instances_part2<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); } if(ck::get_device_name() == "gfx950") @@ -108,6 +125,15 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); + + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_int8_comp_instances_2x<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); } } diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp index 078221f89f..2afbfdc386 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. 
All rights reserved. #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" @@ -46,6 +46,14 @@ void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances( Empty_Tuple, GNHWK, ConvFwd1x1S1P0>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_instances<2, + GNHWC, + GKYXC, + Empty_Tuple, + GNHWK, + ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp index 3a481dd204..822ef51e00 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" @@ -46,6 +46,14 @@ void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances( Empty_Tuple, GNHWK, ConvFwd1x1S1P0>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_instances<2, + GNHWC, + GKYXC, + Empty_Tuple, + GNHWK, + ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp index 5add0f8add..79a1fb99a8 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" @@ -46,6 +46,14 @@ void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances( Empty_Tuple, GNHWK, ConvFwd1x1S1P0>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_instances<2, + GNHWC, + GKYXC, + Empty_Tuple, + GNHWK, + ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp index 0257c7d315..e567c0df75 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" @@ -46,6 +46,14 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp index 2715506fe2..3e42184996 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" @@ -46,6 +46,14 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp index 8d3e4d91b1..c035d4c3da 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" @@ -46,6 +46,14 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp index 465fa927a5..5c425effd8 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" @@ -46,6 +46,14 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instances( Empty_Tuple, NHWGK, ConvFwd1x1S1P0>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp index 87423801cb..e8a763c527 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" @@ -49,6 +49,15 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance NHWGK, ConvFwd1x1S1P0, Interwave>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC, + Interwave>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp index ebb213461a..3ae3fb5186 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" @@ -49,6 +49,15 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance NHWGK, ConvFwd1x1S1P0, Intrawave>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC, + Intrawave>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp index c2c8a099b2..cb7e912936 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" @@ -49,6 +49,15 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances NHWGK, ConvFwd1x1S1P0, Interwave>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_mem_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC, + Interwave>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp index 11cb853f0d..d787f4b048 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" @@ -49,6 +49,15 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances NHWGK, ConvFwd1x1S1P0, Intrawave>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f16_mem_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC, + Intrawave>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp index 1992d7f7c1..5644289790 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" @@ -49,6 +49,15 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances NHWGK, ConvFwd1x1S1P0, Interwave>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_mem_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC, + Interwave>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp index 2b8fd3d9db..5b12dad5a3 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" @@ -49,6 +49,15 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances NHWGK, ConvFwd1x1S1P0, Intrawave>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_f32_mem_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC, + Intrawave>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp index 5579ec62cc..f667481fa4 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" @@ -49,6 +49,15 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance NHWGK, ConvFwd1x1S1P0, Interwave>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_mem_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC, + Interwave>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp index 77f3df2c11..2ff2c7f51f 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" @@ -49,6 +49,15 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance NHWGK, ConvFwd1x1S1P0, Intrawave>{}); + + add_device_operation_instances(instances, + device_grouped_conv_fwd_xdl_int8_mem_instances<2, + NHWGC, + GKYXC, + Empty_Tuple, + NHWGK, + ConvFwdOddC, + Intrawave>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt index f55bdd45c9..f8efa5a7c1 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt @@ -66,6 +66,10 @@ set(GROUPED_CONV3D_FWD wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp ) if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp new file mode 100644 index 0000000000..fa378af1ee --- /dev/null +++ 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = out[g, n, do, ho, +// wo, k] +void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_wmma_f16_instances<3, + GNDHWC, + GKZYXC, + Empty_Tuple, + GNDHWK, + Empty_Tuple, + PassThrough, + ConvFwdOddC>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp new file mode 100644 index 0000000000..d41416fd4a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = out[g, n, do, ho, +// wo, k] +void add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_wmma_i8_instances<3, + GNDHWC, + GKZYXC, + Empty_Tuple, + GNDHWK, + Empty_Tuple, + PassThrough, + ConvFwdOddC>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp new file mode 100644 index 0000000000..8a7bc26178 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, +// g, k] +void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_wmma_f16_instances<3, + NDHWGC, + GKZYXC, + Empty_Tuple, + NDHWGK, + Empty_Tuple, + PassThrough, + ConvFwdOddC>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp new file mode 100644 index 0000000000..7649f86971 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, +// g, k] +void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv_fwd_wmma_i8_instances<3, + NDHWGC, + GKZYXC, + Empty_Tuple, + NDHWGK, + Empty_Tuple, + PassThrough, + ConvFwdOddC>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck From b8b12bb81e1b370d39ab7b17b0c13654a6e54721 Mon Sep 17 00:00:00 2001 From: jefyang1 <146495389+jefyang1@users.noreply.github.com> Date: Mon, 19 May 2025 14:25:50 -0700 Subject: [PATCH 128/443] Fix example_grouped_gemm_multiple_d_xdl_fp16 on gfx950 (#2203) * Fix example_grouped_gemm_multiple_d_xdl_fp16 on gfx950 * Run clang format --- example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp index db162fe444..63a2aea0b3 100644 --- a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp @@ -141,8 +141,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co a_tensors_device.reserve(group_count); b_tensors_device.reserve(group_count); - d_tensors_device.reserve(group_count); c_tensors_device.reserve(group_count); + d_tensors_device.resize(group_count); // reserve and update vector size std::size_t flop = 0, num_btype = 0; From 57e0f5df29abefd919c334c994628a994ba2868c Mon Sep 17 00:00:00 2001 
From: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com> Date: Mon, 19 May 2025 15:52:51 -0600 Subject: [PATCH 129/443] MX GEMM - Expand MX MFMA Testing to BF8, FP6, and BF6 Data Types (#2199) * Unify test interface for different layouts. * WIP: Introducing FP4/FP6/FP8 abstractions * WIP: Introducing packed storage abstraction * WIP: Introducing packed storage abstraction * WIP: Improved support for FP6 data type * Refactor packed storage for f6_t * WIP: FP6 MFMA test * Test if we correctly represent all FP6/FP4 numbers * Additional output for failed FP4 test. * More failing conversion tests * Even more failing conversion tests * Working FP6 MFMA tests * Expand MX MFMA testing to BF8/6 * Update and verify MX MFMA test for packed types * Fix fp4 and fp6 conversions on host * Working MX MFMA tests for FP8/6/4 * Cleanup * Add missing type * Cleanup * Final cleanup * Restrict FP6/4 values output to CK_LOGGING=1 * Use CHAR_BIT instead of number 8 * Fix typo * Remove FP6 and FP4 from the list of native types --------- Co-authored-by: Rostyslav Geyyer --- include/ck/library/utility/host_tensor.hpp | 57 +-- .../library/utility/host_tensor_generator.hpp | 232 ++++++++++ include/ck/utility/amd_xdlops.hpp | 390 ++++++++++++++-- include/ck/utility/data_type.hpp | 428 +++++++----------- include/ck/utility/dtype_vector.hpp | 104 ++++- include/ck/utility/mxf4_utils.hpp | 12 +- include/ck/utility/mxf6_utils.hpp | 8 +- .../cpu/reference_gemm.hpp | 16 + .../cpu/reference_mx_gemm.hpp | 20 + test/data_type/test_bf6.cpp | 111 ++++- test/data_type/test_fp4.cpp | 57 +++ test/data_type/test_fp6.cpp | 106 ++++- test/mx_mfma_op/mx_mfma_op.cpp | 365 ++++++++++++--- test/mx_mfma_op/mx_mfma_op.hpp | 282 ++++++------ 14 files changed, 1601 insertions(+), 587 deletions(-) diff --git a/include/ck/library/utility/host_tensor.hpp b/include/ck/library/utility/host_tensor.hpp index 71417ce7bf..257636d956 100644 --- a/include/ck/library/utility/host_tensor.hpp +++ 
b/include/ck/library/utility/host_tensor.hpp @@ -360,10 +360,9 @@ struct Tensor std::size_t GetElementSpaceSize() const { - if constexpr(ck::is_same_v, ck::pk_i4_t> || - ck::is_same_v, ck::f4x2_pk_t>) + if constexpr(ck::is_packed_type_v>) { - return (mDesc.GetElementSpaceSize() + 1) / 2; + return (mDesc.GetElementSpaceSize() + 1) / ck::packed_size_v>; } else { @@ -516,69 +515,31 @@ struct Tensor template std::size_t GetOffsetFromMultiIndex(Is... is) const { - if constexpr(ck::is_same_v, ck::pk_i4_t> || - ck::is_same_v, ck::f4x2_pk_t>) - { - return mDesc.GetOffsetFromMultiIndex(is...) / 2; - } - else - { - return mDesc.GetOffsetFromMultiIndex(is...); - } + return mDesc.GetOffsetFromMultiIndex(is...) / ck::packed_size_v>; } template T& operator()(Is... is) { - if constexpr(ck::is_same_v, ck::pk_i4_t> || - ck::is_same_v, ck::f4x2_pk_t>) - { - return mData[mDesc.GetOffsetFromMultiIndex(is...) / 2]; - } - else - { - return mData[mDesc.GetOffsetFromMultiIndex(is...)]; - } + return mData[mDesc.GetOffsetFromMultiIndex(is...) / + ck::packed_size_v>]; } template const T& operator()(Is... is) const { - if constexpr(ck::is_same_v, ck::pk_i4_t> || - ck::is_same_v, ck::f4x2_pk_t>) - { - return mData[mDesc.GetOffsetFromMultiIndex(is...) / 2]; - } - else - { - return mData[mDesc.GetOffsetFromMultiIndex(is...)]; - } + return mData[mDesc.GetOffsetFromMultiIndex(is...) 
/ + ck::packed_size_v>]; } T& operator()(std::vector idx) { - if constexpr(ck::is_same_v, ck::pk_i4_t> || - ck::is_same_v, ck::f4x2_pk_t>) - { - return mData[mDesc.GetOffsetFromMultiIndex(idx) / 2]; - } - else - { - return mData[mDesc.GetOffsetFromMultiIndex(idx)]; - } + return mData[mDesc.GetOffsetFromMultiIndex(idx) / ck::packed_size_v>]; } const T& operator()(std::vector idx) const { - if constexpr(ck::is_same_v, ck::pk_i4_t> || - ck::is_same_v, ck::f4x2_pk_t>) - { - return mData[mDesc.GetOffsetFromMultiIndex(idx) / 2]; - } - else - { - return mData[mDesc.GetOffsetFromMultiIndex(idx)]; - } + return mData[mDesc.GetOffsetFromMultiIndex(idx) / ck::packed_size_v>]; } typename Data::iterator begin() { return mData.begin(); } diff --git a/include/ck/library/utility/host_tensor_generator.hpp b/include/ck/library/utility/host_tensor_generator.hpp index 785f74a3c0..f48ba49bbf 100644 --- a/include/ck/library/utility/host_tensor_generator.hpp +++ b/include/ck/library/utility/host_tensor_generator.hpp @@ -67,6 +67,18 @@ struct GeneratorTensor_1 return ck::type_convert(value); } }; + +template <> +struct GeneratorTensor_1 +{ + float value = 1.0; + + template + ck::bf8_t operator()(Is...) + { + return ck::type_convert(value); + } +}; #endif template <> @@ -93,6 +105,38 @@ struct GeneratorTensor_1 } }; +template <> +struct GeneratorTensor_1 +{ + float value = 1.0; + + template + ck::f6x32_pk_t operator()(Is...) + { + ck::f6x32_pk_t r; + ck::static_for<0, 32, 1>{}([&](auto i) { + r.pack(ck::type_convert(value), static_cast(i)); + }); + return r; + } +}; + +template <> +struct GeneratorTensor_1 +{ + float value = 1.0; + + template + ck::bf6x32_pk_t operator()(Is...) 
+ { + ck::bf6x32_pk_t r; + ck::static_for<0, 32, 1>{}([&](auto i) { + r.pack(ck::type_convert(value), static_cast(i)); + }); + return r; + } +}; + template <> struct GeneratorTensor_1 { @@ -132,6 +176,44 @@ struct GeneratorTensor_2 } }; +template <> +struct GeneratorTensor_2 +{ + int min_value = 0; + int max_value = 1; + + template + ck::f6x32_pk_t operator()(Is...) + { + ck::f6x32_pk_t r; + ck::static_for<0, 32, 1>{}([&](auto i) { + float tmp = (std::rand() % (max_value - min_value)) + min_value; + r.pack(ck::type_convert(tmp), static_cast(i)); + }); + + return r; + } +}; + +template <> +struct GeneratorTensor_2 +{ + int min_value = 0; + int max_value = 1; + + template + ck::bf6x32_pk_t operator()(Is...) + { + ck::bf6x32_pk_t r; + ck::static_for<0, 32, 1>{}([&](auto i) { + float tmp = (std::rand() % (max_value - min_value)) + min_value; + r.pack(ck::type_convert(tmp), static_cast(i)); + }); + + return r; + } +}; + template <> struct GeneratorTensor_2 { @@ -342,6 +424,46 @@ struct GeneratorTensor_3 } }; +template <> +struct GeneratorTensor_3 +{ + float min_value = 0; + float max_value = 1; + + template + ck::f6x32_pk_t operator()(Is...) + { + ck::f6x32_pk_t r; + ck::static_for<0, 32, 1>{}([&](auto i) { + float rnd = float(std::rand()) / float(RAND_MAX); + float fp32 = min_value + rnd * (max_value - min_value); + r.pack(ck::type_convert(fp32), static_cast(i)); + }); + + return r; + } +}; + +template <> +struct GeneratorTensor_3 +{ + float min_value = 0; + float max_value = 1; + + template + ck::bf6x32_pk_t operator()(Is...) 
+ { + ck::bf6x32_pk_t r; + ck::static_for<0, 32, 1>{}([&](auto i) { + float rnd = float(std::rand()) / float(RAND_MAX); + float fp32 = min_value + rnd * (max_value - min_value); + r.pack(ck::type_convert(fp32), static_cast(i)); + }); + + return r; + } +}; + template struct GeneratorTensor_4 { @@ -360,6 +482,69 @@ struct GeneratorTensor_4 } }; +template <> +struct GeneratorTensor_4 +{ + std::mt19937 generator; + std::normal_distribution distribution; + + GeneratorTensor_4(float mean, float stddev, unsigned int seed = 1) + : generator(seed), distribution(mean, stddev){}; + + template + ck::f4x2_pk_t operator()(Is...) + { + float fp32_tmp0 = distribution(generator); + float fp32_tmp1 = distribution(generator); + + return ck::f4x2_pk_t{ck::type_convert(ck::float2_t{fp32_tmp0, fp32_tmp1})}; + } +}; + +template <> +struct GeneratorTensor_4 +{ + std::mt19937 generator; + std::normal_distribution distribution; + + GeneratorTensor_4(float mean, float stddev, unsigned int seed = 1) + : generator(seed), distribution(mean, stddev){}; + + template + ck::f6x32_pk_t operator()(Is...) + { + ck::f6x32_pk_t r; + ck::static_for<0, 32, 1>{}([&](auto i) { + r.pack(ck::type_convert(distribution(generator)), + static_cast(i)); + }); + + return r; + } +}; + +template <> +struct GeneratorTensor_4 +{ + std::mt19937 generator; + std::normal_distribution distribution; + + GeneratorTensor_4(float mean, float stddev, unsigned int seed = 1) + : generator(seed), distribution(mean, stddev){}; + + template + ck::bf6x32_pk_t operator()(Is...) + { + ck::bf6x32_pk_t r; + ck::static_for<0, 32, 1>{}([&](auto i) { + r.pack(ck::type_convert(distribution(generator)), + static_cast(i)); + }); + + return r; + } +}; + struct GeneratorTensor_Checkboard { template @@ -405,6 +590,53 @@ struct GeneratorTensor_Sequential } }; +template +struct GeneratorTensor_Sequential +{ + template + ck::f4x2_pk_t operator()(Ts... 
Xs) const + { + std::array dims = {{static_cast(Xs)...}}; + + float tmp = dims[Dim]; + return ck::type_convert(ck::float2_t(tmp)); + } +}; + +template +struct GeneratorTensor_Sequential +{ + template + ck::f6x32_pk_t operator()(Ts... Xs) const + { + std::array dims = {{static_cast(Xs)...}}; + + float tmp = dims[Dim]; + + ck::f6x32_pk_t r; + ck::static_for<0, 32, 1>{}( + [&](auto i) { r.pack(ck::type_convert(tmp), static_cast(i)); }); + return r; + } +}; + +template +struct GeneratorTensor_Sequential +{ + template + ck::bf6x32_pk_t operator()(Ts... Xs) const + { + std::array dims = {{static_cast(Xs)...}}; + + float tmp = dims[Dim]; + + ck::bf6x32_pk_t r; + ck::static_for<0, 32, 1>{}( + [&](auto i) { r.pack(ck::type_convert(tmp), static_cast(i)); }); + return r; + } +}; + template struct GeneratorTensor_Diagonal { diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index 66c4958e1d..ad48389625 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -498,7 +498,7 @@ struct intrin_mfma_f32_32x32x64f8f6f4<32, 32> reg_a, reg_b, reg_c.template AsType()[Number<0>{}], - 0, // cbsz + 0, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} 0, // blgp 0, 0, @@ -511,6 +511,28 @@ struct intrin_mfma_f32_32x32x64f8f6f4<32, 32> #endif } + template + __device__ static void Run(const bf8x32_t& reg_a, const bf8x32_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx950__) + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + reg_a, + reg_b, + reg_c.template AsType()[Number<0>{}], + 1, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 1, // blgp + 0, + 0, + 0, + 0); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; +#endif + } + template __device__ static void Run(const f4x32_t& reg_a, const f4x32_t& reg_b, FloatC& reg_c) { @@ -536,6 +558,62 @@ struct intrin_mfma_f32_32x32x64f8f6f4<32, 32> ignore = reg_a; ignore = reg_b; ignore = reg_c; 
+#endif + } + + template + __device__ static void Run(const f6x32_t& reg_a, const f6x32_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx950__) + + int32x6_t arg_a = bit_cast(reg_a); + int32x6_t arg_b = bit_cast(reg_b); + + using arg_type = int32x8_t; + + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + arg_type{arg_a[0], arg_a[1], arg_a[2], arg_a[3], arg_a[4], arg_a[5], 0, 0}, + arg_type{arg_b[0], arg_b[1], arg_b[2], arg_b[3], arg_b[4], arg_b[5], 0, 0}, + reg_c.template AsType()[Number<0>{}], + 2, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 2, // blgp + 0, // OPSEL + 0, + 0, // OPSEL + 0); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; +#endif + } + + template + __device__ static void Run(const bf6x32_t& reg_a, const bf6x32_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx950__) + + int32x6_t arg_a = bit_cast(reg_a); + int32x6_t arg_b = bit_cast(reg_b); + + using arg_type = int32x8_t; + + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + arg_type{arg_a[0], arg_a[1], arg_a[2], arg_a[3], arg_a[4], arg_a[5], 0, 0}, + arg_type{arg_b[0], arg_b[1], arg_b[2], arg_b[3], arg_b[4], arg_b[5], 0, 0}, + reg_c.template AsType()[Number<0>{}], + 3, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 3, // blgp + 0, // OPSEL + 0, + 0, // OPSEL + 0); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; #endif } }; @@ -583,6 +661,43 @@ struct intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32> #endif } + template + __device__ static void Run(const bf8x32_t& reg_a, + const int32_t& scale_a, + const bf8x32_t& reg_b, + const int32_t& scale_b, + FloatC& reg_c) + { +#if defined(__gfx950__) + // https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/llvm/test/Verifier/AMDGPU/mfma-scale.ll#L10 + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + reg_a, + reg_b, + reg_c.template 
AsType()[Number<0>{}], + 1, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 1, // blgp + 0, // OPSEL + scale_a, + 0, // OPSEL + scale_b); + // XXX: Note on the scale_a and scale_b parameters: + // If compiler detects that one or both scales are constant values, it will treat that + // constant as F32 constant. I.e., if scale_a at some point was declared as + // `e8m0_bexp_t a_scale{1.0f}`, the instruction would only work if scale_a parameter is + // assigned value `bit_cast(static_cast(a_scale))`. + + // XXX: Note on the OPSEL parameters: Instruction always takes byte0 as a scale value even + // when OPSEL is set otherwise. +#else + ignore = reg_a; + ignore = scale_a; + ignore = reg_b; + ignore = scale_b; + ignore = reg_c; +#endif + } + template __device__ static void Run(const bf8x32_t& reg_a, const int32_t& scale_a, @@ -620,6 +735,74 @@ struct intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32> #endif } + template + __device__ static void Run(const f6x32_t& reg_a, + const int32_t scale_a, + const f6x32_t& reg_b, + const int32_t scale_b, + FloatC& reg_c) + { +#if defined(__gfx950__) + + int32x6_t arg_a = bit_cast(reg_a); + int32x6_t arg_b = bit_cast(reg_b); + + using arg_type = int32x8_t; + + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + arg_type{arg_a[0], arg_a[1], arg_a[2], arg_a[3], arg_a[4], arg_a[5], 0, 0}, + arg_type{arg_b[0], arg_b[1], arg_b[2], arg_b[3], arg_b[4], arg_b[5], 0, 0}, + reg_c.template AsType()[Number<0>{}], + 2, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 2, // blgp + 0, // OPSEL + scale_a, + 0, // OPSEL + scale_b); +#else + ignore = reg_a; + ignore = scale_a; + ignore = reg_b; + ignore = scale_b; + ignore = reg_c; +#endif + } + + template + __device__ static void Run(const bf6x32_t& reg_a, + const int32_t scale_a, + const bf6x32_t& reg_b, + const int32_t scale_b, + FloatC& reg_c) + { +#if defined(__gfx950__) + + int32x6_t arg_a = bit_cast(reg_a); + 
int32x6_t arg_b = bit_cast(reg_b); + + using arg_type = int32x8_t; + + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + arg_type{arg_a[0], arg_a[1], arg_a[2], arg_a[3], arg_a[4], arg_a[5], 0, 0}, + arg_type{arg_b[0], arg_b[1], arg_b[2], arg_b[3], arg_b[4], arg_b[5], 0, 0}, + reg_c.template AsType()[Number<0>{}], + 3, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 3, // blgp + 0, // OPSEL + scale_a, + 0, // OPSEL + scale_b); +#else + ignore = reg_a; + ignore = scale_a; + ignore = reg_b; + ignore = scale_b; + ignore = reg_c; +#endif + } + template __device__ static void Run(const f4x32_t& reg_a, const int32_t scale_a, @@ -639,7 +822,7 @@ struct intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32> arg_type{arg_a[0], arg_a[1], arg_a[2], arg_a[3], 0, 0, 0, 0}, arg_type{arg_b[0], arg_b[1], arg_b[2], arg_b[3], 0, 0, 0, 0}, reg_c.template AsType()[Number<0>{}], - 4, // cbsz + 4, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} 4, // blgp 0, // OPSEL scale_a, @@ -748,6 +931,101 @@ struct intrin_mfma_scale_f32_16x16x128f8f6f4<16, 16> #endif } + template + __device__ static void Run(const bf8x32_t& reg_a, + const int32_t& scale_a, + const f8x32_t& reg_b, + const int32_t& scale_b, + FloatC& reg_c) + { +#if defined(__gfx950__) + // https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/llvm/test/Verifier/AMDGPU/mfma-scale.ll#L10 + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + reg_a, + reg_b, + reg_c.template AsType()[Number<0>{}], + 1, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 0, // blgp + 0, // OPSEL + scale_a, + 0, // OPSEL + scale_b); +#else + ignore = reg_a; + ignore = scale_a; + ignore = reg_b; + ignore = scale_b; + ignore = reg_c; +#endif + } + + template + __device__ static void Run(const f6x32_t& reg_a, + const int32_t scale_a, + const f6x32_t& reg_b, + const int32_t scale_b, + FloatC& 
reg_c) + { +#if defined(__gfx950__) + int32x6_t arg_a = bit_cast(reg_a); + int32x6_t arg_b = bit_cast(reg_b); + + using arg_type = int32x8_t; + + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + arg_type{arg_a[0], arg_a[1], arg_a[2], arg_a[3], arg_a[4], arg_a[5], 0, 0}, + arg_type{arg_b[0], arg_b[1], arg_b[2], arg_b[3], arg_b[4], arg_b[5], 0, 0}, + reg_c.template AsType()[Number<0>{}], + 2, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 2, // blgp + 0, // OPSEL + scale_a, + 0, // OPSEL + scale_b); +#else + ignore = reg_a; + ignore = scale_a; + ignore = reg_b; + ignore = scale_b; + ignore = reg_c; +#endif + } + + template + __device__ static void Run(const bf6x32_t& reg_a, + const int32_t scale_a, + const bf6x32_t& reg_b, + const int32_t scale_b, + FloatC& reg_c) + { +#if defined(__gfx950__) + int32x6_t arg_a = bit_cast(reg_a); + int32x6_t arg_b = bit_cast(reg_b); + + using arg_type = int32x8_t; + + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + arg_type{arg_a[0], arg_a[1], arg_a[2], arg_a[3], arg_a[4], arg_a[5], 0, 0}, + arg_type{arg_b[0], arg_b[1], arg_b[2], arg_b[3], arg_b[4], arg_b[5], 0, 0}, + reg_c.template AsType()[Number<0>{}], + 3, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 3, // blgp + 0, // OPSEL + scale_a, + 0, // OPSEL + scale_b); +#else + ignore = reg_a; + ignore = scale_a; + ignore = reg_b; + ignore = scale_b; + ignore = reg_c; +#endif + } + template __device__ static void Run(const f4x32_t& reg_a, const int32_t scale_a, @@ -778,35 +1056,6 @@ struct intrin_mfma_scale_f32_16x16x128f8f6f4<16, 16> ignore = reg_b; ignore = scale_b; ignore = reg_c; -#endif - } - - template - __device__ static void Run(const bf8x32_t& reg_a, - const int32_t& scale_a, - const f8x32_t& reg_b, - const int32_t& scale_b, - FloatC& reg_c) - { -#if defined(__gfx950__) - // 
https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/llvm/test/Verifier/AMDGPU/mfma-scale.ll#L10 - reg_c.template AsType()(Number<0>{}) = - __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( - reg_a, - reg_b, - reg_c.template AsType()[Number<0>{}], - 1, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} - 0, // blgp - 0, // OPSEL - scale_a, - 0, // OPSEL - scale_b); -#else - ignore = reg_a; - ignore = scale_a; - ignore = reg_b; - ignore = scale_b; - ignore = reg_c; #endif } }; @@ -833,7 +1082,7 @@ struct intrin_mfma_f32_16x16x128f8f6f4<16, 16> reg_a, reg_b, reg_c.template AsType()[Number<0>{}], - 0, // cbsz + 0, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} 0, // blgp 0, 0, @@ -846,6 +1095,29 @@ struct intrin_mfma_f32_16x16x128f8f6f4<16, 16> #endif } + template + __device__ static void Run(const bf8x32_t& reg_a, const bf8x32_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx950__) + // https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/llvm/test/Verifier/AMDGPU/mfma-scale.ll#L10 + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + reg_a, + reg_b, + reg_c.template AsType()[Number<0>{}], + 1, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 1, // blgp + 0, + 0, + 0, + 0); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; +#endif + } + template __device__ static void Run(const f4x32_t& reg_a, const f4x32_t& reg_b, FloatC& reg_c) { @@ -870,6 +1142,60 @@ struct intrin_mfma_f32_16x16x128f8f6f4<16, 16> ignore = reg_a; ignore = reg_b; ignore = reg_c; +#endif + } + + template + __device__ static void Run(const f6x32_t& reg_a, const f6x32_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx950__) + int32x6_t arg_a = bit_cast(reg_a); + int32x6_t arg_b = bit_cast(reg_b); + + using arg_type = int32x8_t; + + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + 
arg_type{arg_a[0], arg_a[1], arg_a[2], arg_a[3], arg_a[4], arg_a[5], 0, 0}, + arg_type{arg_b[0], arg_b[1], arg_b[2], arg_b[3], arg_b[4], arg_b[5], 0, 0}, + reg_c.template AsType()[Number<0>{}], + 2, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 2, // blgp + 0, // OPSEL + 0, + 0, // OPSEL + 0); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; +#endif + } + + template + __device__ static void Run(const bf6x32_t& reg_a, const bf6x32_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx950__) + int32x6_t arg_a = bit_cast(reg_a); + int32x6_t arg_b = bit_cast(reg_b); + + using arg_type = int32x8_t; + + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + arg_type{arg_a[0], arg_a[1], arg_a[2], arg_a[3], arg_a[4], arg_a[5], 0, 0}, + arg_type{arg_b[0], arg_b[1], arg_b[2], arg_b[3], arg_b[4], arg_b[5], 0, 0}, + reg_c.template AsType()[Number<0>{}], + 3, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 3, // blgp + 0, // OPSEL + 0, + 0, // OPSEL + 0); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; #endif } }; diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index a6106bb146..c11b9c0272 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -32,8 +32,14 @@ using f4_t = unsigned _BitInt(4); using f6_t = _BitInt(6); // e2m3 format using bf6_t = unsigned _BitInt(6); // e3m2 format +// scalar_type +template +struct scalar_type; + struct f4x2_pk_t { + static constexpr int packed_size = 2; + using type = uint8_t; type data; __host__ __device__ f4x2_pk_t() : data{type{}} {} @@ -55,269 +61,82 @@ struct f4x2_pk_t } }; -struct f6x16_pk_t +template +struct f6_pk_t { - // store 16 elements of f6_t in an array of 3 uint32_t - using element_type = uint32_t; - using type = StaticallyIndexedArray_v2; - type data; - typedef int8_t test_vec_t __attribute__((ext_vector_type(16))); - f6x16_pk_t() : data{type{}} {} - 
f6x16_pk_t(type init) : data{init} {} + using element_type = uint32_t; // element storage fundamental type - template - __host__ __device__ inline f6_t unpack(Number) + static constexpr index_t packed_size = pk_size; + static constexpr index_t num_bits_elem = 6; + static constexpr index_t num_bits_vec_elem = sizeof(element_type) * CHAR_BIT; + static_assert((packed_size * num_bits_elem) % num_bits_vec_elem == 0, + "Packed elements must fit exactly into the element storage."); + static constexpr index_t vector_size = (packed_size * num_bits_elem) / num_bits_vec_elem; + + using storage_type = StaticallyIndexedArray_v2; + storage_type data; // packed data + + using type = f6_pk_t; + + __host__ __device__ constexpr f6_pk_t() : data{} {} + __host__ __device__ constexpr f6_pk_t(storage_type init) : data{init} {} + template ::vector_size == packed_size>> + __host__ __device__ f6_pk_t(const T& v) : data{} { - static_assert(I < 16, "Index out of range for 16 f6_t elements."); + static_for<0, packed_size, 1>{}( + [&](auto i) { pack(v[static_cast(i)], static_cast(i)); }); + } - constexpr int num_bits_elem = 6; - constexpr int num_bits_vec_elem = 32; - constexpr int vector_size = 3; - constexpr int bit_pos = I * num_bits_elem; - constexpr int arr_idx = bit_pos / num_bits_vec_elem; - constexpr int bit_offset = bit_pos % num_bits_vec_elem; - uint32_t bits = data.At(Number{}) >> bit_offset; - constexpr int overhang = bit_offset + num_bits_elem - num_bits_vec_elem; + template + __host__ __device__ void pack(const T x, const index_t i) + { + static_assert(is_integral::value || is_same_v, + "T must be an integral type."); - if constexpr(overhang > 0 && (arr_idx + 1) < vector_size) + uint32_t bits = static_cast(x) & 0x3F; + const int bit_pos = i * num_bits_elem; + const int arr_index = bit_pos / num_bits_vec_elem; + const int bit_offset = bit_pos % num_bits_vec_elem; + const int overhang = bit_offset + num_bits_elem - num_bits_vec_elem; + uint32_t old_value = data.data_[arr_index]; + 
+ // insert bits into the current 32-bit block + old_value |= (bits << bit_offset); + data.data_[arr_index] = old_value; + + // if it crosses into the next block, shift the remainder + if(overhang > 0 && (arr_index + 1) < vector_size) { - bits |= (data.At(Number{}) & ((1u << overhang) - 1)) + uint32_t next_value = data.data_[arr_index + 1]; + next_value |= (bits >> (num_bits_elem - overhang)); + data.data_[arr_index + 1] = next_value; + } + } + + __host__ __device__ static inline BitType unpack(const type& pk, const index_t i) + { + const int bit_pos = i * num_bits_elem; + const int arr_idx = bit_pos / num_bits_vec_elem; + const int bit_offset = bit_pos % num_bits_vec_elem; + const int overhang = bit_offset + num_bits_elem - num_bits_vec_elem; + + uint32_t bits = pk.data.data_[arr_idx] >> bit_offset; + if(overhang > 0 && (arr_idx + 1) < vector_size) + { + bits |= (pk.data.data_[arr_idx + 1] & ((1u << overhang) - 1)) << (num_bits_elem - overhang); } - return static_cast(bits & 0x3F); + return static_cast(bits & 0x3F); } - __host__ __device__ inline type pack(const test_vec_t& x) - { - type packed{}; - - // for each of the 16 f6_t values, place its 6 bits in the correct position - ck::static_for<0, 16, 1>{}([&](auto i) { - uint32_t bits = static_cast(x[static_cast(i)]) & 0x3F; - constexpr int num_bits_elem = 6; - constexpr int num_bits_vec_elem = 32; - constexpr int vector_size = 3; - constexpr int bit_pos = i * num_bits_elem; - constexpr int arr_index = bit_pos / num_bits_vec_elem; - constexpr int bit_offset = bit_pos % num_bits_vec_elem; - constexpr int overhang = bit_offset + num_bits_elem - num_bits_vec_elem; - uint32_t old_value = packed.At(Number{}); - - // insert bits into the current 32-bit block - old_value |= (bits << bit_offset); - packed.At(Number{}) = old_value; - - // if it crosses into the next block, shift the remainder - if constexpr(overhang > 0 && (arr_index + 1) < vector_size) - { - uint32_t next_value = packed.At(Number{}); - next_value |= (bits 
>> (num_bits_elem - overhang)); - packed.At(Number{}) = next_value; - } - }); - - return packed; - } + __host__ __device__ inline BitType unpack(const index_t i) const { return unpack(*this, i); } }; -struct f6x32_pk_t -{ - // store 32 elements of f6_t in an array of 6 uint32_t - using element_type = uint32_t; - using type = StaticallyIndexedArray_v2; - type data; - typedef int8_t test_vec_t __attribute__((ext_vector_type(32))); - f6x32_pk_t() : data{type{}} {} - f6x32_pk_t(type init) : data{init} {} - - template - __host__ __device__ inline f6_t unpack(Number) - { - static_assert(I < 32, "Index out of range for 32 f6_t elements."); - - constexpr int num_bits_elem = 6; - constexpr int num_bits_vec_elem = 32; - constexpr int vector_size = 6; - constexpr int bit_pos = I * num_bits_elem; - constexpr int arr_idx = bit_pos / num_bits_vec_elem; - constexpr int bit_offset = bit_pos % num_bits_vec_elem; - uint32_t bits = data.At(Number{}) >> bit_offset; - constexpr int overhang = bit_offset + num_bits_elem - num_bits_vec_elem; - - if constexpr(overhang > 0 && (arr_idx + 1) < vector_size) - { - bits |= (data.At(Number{}) & ((1u << overhang) - 1)) - << (num_bits_elem - overhang); - } - - return static_cast(bits & 0x3F); - } - - __host__ __device__ inline type pack(const test_vec_t& x) - { - type packed{}; - - // for each of the 32 f6_t values, place its 6 bits in the correct position - ck::static_for<0, 32, 1>{}([&](auto i) { - uint32_t bits = static_cast(x[static_cast(i)]) & 0x3F; - constexpr int num_bits_elem = 6; - constexpr int num_bits_vec_elem = 32; - constexpr int vector_size = 6; - constexpr int bit_pos = i * num_bits_elem; - constexpr int arr_index = bit_pos / num_bits_vec_elem; - constexpr int bit_offset = bit_pos % num_bits_vec_elem; - constexpr int overhang = bit_offset + num_bits_elem - num_bits_vec_elem; - uint32_t old_value = packed.At(Number{}); - - // insert bits into the current 32-bit block - old_value |= (bits << bit_offset); - packed.At(Number{}) = 
old_value; - - // if it crosses into the next block, shift the remainder - if constexpr(overhang > 0 && (arr_index + 1) < vector_size) - { - uint32_t next_value = packed.At(Number{}); - next_value |= (bits >> (num_bits_elem - overhang)); - packed.At(Number{}) = next_value; - } - }); - - return packed; - } -}; - -struct bf6x16_pk_t -{ - // store 16 elements of bf6_t in an array of 3 uint32_t - using element_type = uint32_t; - using type = StaticallyIndexedArray_v2; - type data; - typedef int8_t test_vec_t __attribute__((ext_vector_type(16))); - bf6x16_pk_t() : data{type{}} {} - bf6x16_pk_t(type init) : data{init} {} - - template - __host__ __device__ inline bf6_t unpack(Number) - { - static_assert(I < 16, "Index out of range for 16 f6_t elements."); - - constexpr int num_bits_elem = 6; - constexpr int num_bits_vec_elem = 32; - constexpr int vector_size = 3; - constexpr int bit_pos = I * num_bits_elem; - constexpr int arr_idx = bit_pos / num_bits_vec_elem; - constexpr int bit_offset = bit_pos % num_bits_vec_elem; - uint32_t bits = data.At(Number{}) >> bit_offset; - constexpr int overhang = bit_offset + num_bits_elem - num_bits_vec_elem; - - if constexpr(overhang > 0 && (arr_idx + 1) < vector_size) - { - bits |= (data.At(Number{}) & ((1u << overhang) - 1)) - << (num_bits_elem - overhang); - } - - return static_cast(bits & 0x3F); - } - - __host__ __device__ inline type pack(const test_vec_t& x) - { - type packed{}; - - // for each of the 16 bf6_t values, place its 6 bits in the correct position - ck::static_for<0, 16, 1>{}([&](auto i) { - uint32_t bits = static_cast(x[static_cast(i)]) & 0x3F; - constexpr int num_bits_elem = 6; - constexpr int num_bits_vec_elem = 32; - constexpr int vector_size = 3; - constexpr int bit_pos = i * num_bits_elem; - constexpr int arr_index = bit_pos / num_bits_vec_elem; - constexpr int bit_offset = bit_pos % num_bits_vec_elem; - constexpr int overhang = bit_offset + num_bits_elem - num_bits_vec_elem; - uint32_t old_value = 
packed.At(Number{}); - - // insert bits into the current 32-bit block - old_value |= (bits << bit_offset); - packed.At(Number{}) = old_value; - - // if it crosses into the next block, shift the remainder - if constexpr(overhang > 0 && (arr_index + 1) < vector_size) - { - uint32_t next_value = packed.At(Number{}); - next_value |= (bits >> (num_bits_elem - overhang)); - packed.At(Number{}) = next_value; - } - }); - - return packed; - } -}; - -struct bf6x32_pk_t -{ - // store 32 elements of bf6_t in an array of 6 uint32_t - using element_type = uint32_t; - using type = StaticallyIndexedArray_v2; - type data; - typedef int8_t test_vec_t __attribute__((ext_vector_type(32))); - bf6x32_pk_t() : data{type{}} {} - bf6x32_pk_t(type init) : data{init} {} - - template - __host__ __device__ inline bf6_t unpack(Number) - { - static_assert(I < 32, "Index out of range for 32 f6_t elements."); - - constexpr int num_bits_elem = 6; - constexpr int num_bits_vec_elem = 32; - constexpr int vector_size = 6; - constexpr int bit_pos = I * num_bits_elem; - constexpr int arr_idx = bit_pos / num_bits_vec_elem; - constexpr int bit_offset = bit_pos % num_bits_vec_elem; - uint32_t bits = data.At(Number{}) >> bit_offset; - constexpr int overhang = bit_offset + num_bits_elem - num_bits_vec_elem; - - if constexpr(overhang > 0 && (arr_idx + 1) < vector_size) - { - bits |= (data.At(Number{}) & ((1u << overhang) - 1)) - << (num_bits_elem - overhang); - } - - return static_cast(bits & 0x3F); - } - - __host__ __device__ inline type pack(const test_vec_t& x) - { - type packed{}; - - // for each of the 32 bf6_t values, place its 6 bits in the correct position - ck::static_for<0, 32, 1>{}([&](auto i) { - uint32_t bits = static_cast(x[static_cast(i)]) & 0x3F; - constexpr int num_bits_elem = 6; - constexpr int num_bits_vec_elem = 32; - constexpr int vector_size = 6; - constexpr int bit_pos = i * num_bits_elem; - constexpr int arr_index = bit_pos / num_bits_vec_elem; - constexpr int bit_offset = bit_pos % 
num_bits_vec_elem; - constexpr int overhang = bit_offset + num_bits_elem - num_bits_vec_elem; - uint32_t old_value = packed.At(Number{}); - - // insert bits into the current 32-bit block - old_value |= (bits << bit_offset); - packed.At(Number{}) = old_value; - - // if it crosses into the next block, shift the remainder - if constexpr(overhang > 0 && (arr_index + 1) < vector_size) - { - uint32_t next_value = packed.At(Number{}); - next_value |= (bits >> (num_bits_elem - overhang)); - packed.At(Number{}) = next_value; - } - }); - - return packed; - } -}; +using f6x16_pk_t = f6_pk_t; +using f6x32_pk_t = f6_pk_t; +using bf6x16_pk_t = f6_pk_t; +using bf6x32_pk_t = f6_pk_t; // custom data type - pack int4 data struct pk_i4_t @@ -335,15 +154,14 @@ inline constexpr auto next_pow2(uint32_t x) } // native types: double, float, _Float16, ushort, int32_t, int8_t, uint8_t, f8_fnuz_t, bf8_fnuz_t, -// native types: bool, f4_t, f6_t, bf6_t +// native types: bool template inline constexpr bool is_native_type() { return is_same::value || is_same::value || is_same::value || - is_same::value || is_same::value || is_same::value || - is_same::value || is_same::value || - is_same::value || is_same::value || is_same::value || - is_same::value || is_same::value; + is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value; } // scalar_type @@ -484,6 +302,106 @@ struct scalar_type static constexpr index_t vector_size = 1; }; +// Default behavior for types that do not need special handling +template +struct packed_type +{ + using type = T; + static constexpr index_t packed_size = 1; // number of packed elements +}; + +template <> +struct packed_type +{ + using type = pk_i4_t; + static constexpr index_t packed_size = 2; // number of packed elements +}; + +template <> +struct packed_type +{ + using type = f4x2_pk_t; + static constexpr index_t packed_size = 2; // number of packed elements +}; + +template <> 
+struct packed_type +{ + using type = f6x32_pk_t; + static constexpr index_t packed_size = f6x32_pk_t::packed_size; // number of packed elements +}; + +template <> +struct packed_type +{ + using type = bf6x32_pk_t; + static constexpr index_t packed_size = bf6x32_pk_t::packed_size; // number of packed elements +}; + +template +using packed_type_t = typename packed_type::type; + +// Check if the type has packed type specialization +template +inline constexpr bool has_packed_type_v = !is_same_v, T>; + +template +struct element_type +{ + private: + static constexpr auto get_element_type() + { + using U = remove_cvref_t; + if constexpr(is_same_v) + return int4_t{}; + else if constexpr(is_same_v) + return f4_t{}; + else if constexpr(is_same_v) + return f6_t{}; + else if constexpr(is_same_v) + return bf6_t{}; + else if constexpr(is_same_v) + return f6_t{}; + else if constexpr(is_same_v) + return bf6_t{}; + else + return T{}; + } + + public: + using type = decltype(get_element_type()); +}; +template +using element_type_t = typename element_type::type; + +template +inline constexpr bool is_packed_type_v = + has_packed_type_v>&& is_same_v>>; + +template +struct packed_size +{ + private: + static constexpr auto get_packed_size() + { + using U = remove_cvref_t; + if constexpr(is_packed_type_v) + return Number>::packed_size>{}; + else + return Number::packed_size>{}; + } + + public: + using type = decltype(get_packed_size()); + static constexpr auto value = get_packed_size(); +}; + +template +using packed_size_t = typename packed_size::type; + +template +inline constexpr index_t packed_size_v = packed_size::value; + #if defined(_WIN32) using int64_t = long long; #else diff --git a/include/ck/utility/dtype_vector.hpp b/include/ck/utility/dtype_vector.hpp index 9c40d923d3..65eed0624c 100644 --- a/include/ck/utility/dtype_vector.hpp +++ b/include/ck/utility/dtype_vector.hpp @@ -365,6 +365,88 @@ struct vector_type()>> } }; +template +struct vector_type()>> +{ + using d1_t = T; + 
typedef T d2_t __attribute__((ext_vector_type(2))); + typedef T d3_t __attribute__((ext_vector_type(3))); + typedef T d6_t __attribute__((ext_vector_type(6))); + + using type = d6_t; + + union + { + d6_t d6_; + StaticallyIndexedArray d1x6_; + StaticallyIndexedArray d2x3_; + StaticallyIndexedArray d3x2_; + StaticallyIndexedArray d6x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value, + "Something went wrong, please check src and dst types."); + + if constexpr(is_same::value) + { + return data_.d1x6_; + } + else if constexpr(is_same::value) + { + return data_.d2x3_; + } + else if constexpr(is_same::value) + { + return data_.d3x2_; + } + else if constexpr(is_same::value) + { + return data_.d6x1_; + } + else + { + return err; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value, + "Something went wrong, please check src and dst types."); + + if constexpr(is_same::value) + { + return data_.d1x6_; + } + else if constexpr(is_same::value) + { + return data_.d2x3_; + } + else if constexpr(is_same::value) + { + return data_.d3x2_; + } + else if constexpr(is_same::value) + { + return data_.d6x1_; + } + else + { + return err; + } + } +}; + template struct vector_type()>> { @@ -1221,25 +1303,25 @@ struct nnvb_data_t_selector template <> struct nnvb_data_t_selector { - using type = f6x16_pk_t::type; + using type = f6x16_pk_t::storage_type; }; template <> struct nnvb_data_t_selector { - using type = f6x32_pk_t::type; + using type = f6x32_pk_t::storage_type; }; template <> struct nnvb_data_t_selector { - using type = bf6x16_pk_t::type; + using type = bf6x16_pk_t::storage_type; }; template <> struct nnvb_data_t_selector 
{ - using type = bf6x32_pk_t::type; + using type = bf6x32_pk_t::storage_type; }; template <> @@ -1406,12 +1488,23 @@ struct non_native_vector_base -struct scalar_type> +struct scalar_type>> { using type = typename non_native_vector_base::data_t; static constexpr index_t vector_size = N; }; +template +struct scalar_type< + non_native_vector_base>> +{ + using type = typename non_native_vector_base::element_t; + static constexpr index_t vector_size = N * non_native_vector_base::size_factor; +}; + // non-native vector_type implementation template struct vector_type()>> @@ -2025,6 +2118,7 @@ using bhalf32_t = typename vector_type::type; // i32 using int32x2_t = typename vector_type::type; using int32x4_t = typename vector_type::type; +using int32x6_t = typename vector_type::type; using int32x8_t = typename vector_type::type; using int32x16_t = typename vector_type::type; using int32x32_t = typename vector_type::type; diff --git a/include/ck/utility/mxf4_utils.hpp b/include/ck/utility/mxf4_utils.hpp index b0b5297f77..53edb6e182 100644 --- a/include/ck/utility/mxf4_utils.hpp +++ b/include/ck/utility/mxf4_utils.hpp @@ -66,7 +66,7 @@ __host__ __device__ inline f4_t sat_convert_to_type(float value) : NumericUtils::data_max_positive_normal_mask; } - if(std::abs(value) > NumericLimits::Max()) // covers inf case as well + if(std::abs(value) > NumericLimits::DataMaxNorm()) // covers inf case as well return sign ? NumericUtils::data_max_negative_normal_mask : NumericUtils::data_max_positive_normal_mask; @@ -74,8 +74,8 @@ __host__ __device__ inline f4_t sat_convert_to_type(float value) if(std::abs(to_float(NumericLimits::Binary_1(), res)) < NumericLimits::DataMinSubnorm()) - return value < 0 ? NumericUtils::negative_zero_mask - : NumericUtils::positive_zero_mask; + return sign ? NumericUtils::negative_zero_mask + : NumericUtils::positive_zero_mask; return res; } @@ -91,7 +91,7 @@ __host__ __device__ inline f4_t sat_convert_to_type_sr(float value, uint32 return sign ? 
NumericUtils::data_max_negative_normal_mask : NumericUtils::data_max_positive_normal_mask; - if(std::abs(value) > NumericLimits::Max()) // covers inf case as well + if(std::abs(value) > NumericLimits::DataMaxNorm()) // covers inf case as well return sign ? NumericUtils::data_max_negative_normal_mask : NumericUtils::data_max_positive_normal_mask; @@ -99,8 +99,8 @@ __host__ __device__ inline f4_t sat_convert_to_type_sr(float value, uint32 if(std::abs(to_float(NumericLimits::Binary_1(), res)) < NumericLimits::DataMinSubnorm()) - return value < 0 ? NumericUtils::negative_zero_mask - : NumericUtils::positive_zero_mask; + return sign ? NumericUtils::negative_zero_mask + : NumericUtils::positive_zero_mask; return res; } diff --git a/include/ck/utility/mxf6_utils.hpp b/include/ck/utility/mxf6_utils.hpp index cf68188b3e..a840c520a9 100644 --- a/include/ck/utility/mxf6_utils.hpp +++ b/include/ck/utility/mxf6_utils.hpp @@ -201,7 +201,7 @@ __host__ __device__ inline f6_t sat_convert_to_type(float value) : NumericUtils::data_max_positive_normal_mask; } - if(std::abs(value) > NumericLimits::Max()) // covers inf case as well + if(std::abs(value) > NumericLimits::DataMaxNorm()) // covers inf case as well return sign ? NumericUtils::data_max_negative_normal_mask : NumericUtils::data_max_positive_normal_mask; @@ -239,7 +239,7 @@ __host__ __device__ inline bf6_t sat_convert_to_type(float value) : NumericUtils::data_max_positive_normal_mask; } - if(std::abs(value) > NumericLimits::Max()) // covers inf case as well + if(std::abs(value) > NumericLimits::DataMaxNorm()) // covers inf case as well return sign ? NumericUtils::data_max_negative_normal_mask : NumericUtils::data_max_positive_normal_mask; @@ -274,7 +274,7 @@ __host__ __device__ inline f6_t sat_convert_to_type_sr(float value, uint32 return sign ? 
NumericUtils::data_max_negative_normal_mask : NumericUtils::data_max_positive_normal_mask; - if(std::abs(value) > NumericLimits::Max()) // covers inf case as well + if(std::abs(value) > NumericLimits::DataMaxNorm()) // covers inf case as well return sign ? NumericUtils::data_max_negative_normal_mask : NumericUtils::data_max_positive_normal_mask; @@ -308,7 +308,7 @@ __host__ __device__ inline bf6_t sat_convert_to_type_sr(float value, uint if(std::isnan(value)) return sign ? NumericUtils::data_max_negative_normal_mask : NumericUtils::data_max_positive_normal_mask; - if(std::abs(value) > NumericLimits::Max()) // covers inf case as well + if(std::abs(value) > NumericLimits::DataMaxNorm()) // covers inf case as well return sign ? NumericUtils::data_max_negative_normal_mask : NumericUtils::data_max_positive_normal_mask; diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp index c8d284a1d7..ed07e53e6d 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp @@ -89,6 +89,14 @@ struct ReferenceGemm : public device::BaseOperator v_a = type_convert( f4_t(arg.a_m_k_(m, k).template unpack<>(Number<0>{}))); } + else if constexpr(is_same_v || + is_same_v || + is_same_v || + is_same_v) + { + v_a = type_convert( + arg.a_m_k_(m, k).unpack(k % ADataType::packed_size)); + } else { arg.a_element_op_(v_a, arg.a_m_k_(m, k)); @@ -115,6 +123,14 @@ struct ReferenceGemm : public device::BaseOperator v_b = type_convert( f4_t(arg.b_k_n_(k, n).template unpack<>(Number<0>{}))); } + else if constexpr(is_same_v || + is_same_v || + is_same_v || + is_same_v) + { + v_b = type_convert( + arg.b_k_n_(k, n).unpack(k % BDataType::packed_size)); + } else { arg.b_element_op_(v_b, arg.b_k_n_(k, n)); diff --git 
a/library/include/ck/library/reference_tensor_operation/cpu/reference_mx_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_mx_gemm.hpp index e8fdcf1acd..3fc39911dd 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_mx_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_mx_gemm.hpp @@ -105,6 +105,16 @@ struct ReferenceMXGemm : public device::BaseOperator type_convert( arg.a_m_kblock_scales_(m, k / SCALE_BLOCK)); } + else if constexpr(is_same_v || + is_same_v || + is_same_v || + is_same_v) + { + a_m_k_scaled(m, k) = + type_convert( + arg.a_m_k_(m, k).unpack(k % ADataType::packed_size)) * + type_convert(arg.a_m_kblock_scales_(m, k / SCALE_BLOCK)); + } else { a_m_k_scaled(m, k) = @@ -134,6 +144,16 @@ struct ReferenceMXGemm : public device::BaseOperator type_convert( arg.b_kblock_n_scales_(k / SCALE_BLOCK, n)); } + else if constexpr(is_same_v || + is_same_v || + is_same_v || + is_same_v) + { + b_k_n_scaled(k, n) = + type_convert( + arg.b_k_n_(k, n).unpack(k % BDataType::packed_size)) * + type_convert(arg.b_kblock_n_scales_(k / SCALE_BLOCK, n)); + } else { b_k_n_scaled(k, n) = diff --git a/test/data_type/test_bf6.cpp b/test/data_type/test_bf6.cpp index a260f81d16..9dbb77454c 100644 --- a/test/data_type/test_bf6.cpp +++ b/test/data_type/test_bf6.cpp @@ -4,6 +4,7 @@ #include "gtest/gtest.h" #include "ck/utility/data_type.hpp" #include "ck/utility/type_convert.hpp" +#include "ck/utility/env.hpp" #include "ck/utility/scaled_type_convert.hpp" using ck::bf6_convert_rne; @@ -41,6 +42,11 @@ TEST(BF6, ConvertFP32Nearest) ASSERT_NEAR(max_bf6, type_convert(bf6_convert_rne(std::numeric_limits::infinity())), 0.0f); + + // convert float +/-30 to bf6 and back, check if clipped to +/-max_bf6 + ASSERT_NEAR(-max_bf6, type_convert(bf6_convert_rne(-30.0f)), 0.0f); + ASSERT_NEAR(max_bf6, type_convert(bf6_convert_rne(30.0f)), 0.0f); + // convert float value less than bf6 subnorm to bf6 and back, check if 
equal to 0.0 float less_than_subnorm = 0.03125f; ASSERT_NEAR(0.0f, type_convert(bf6_convert_rne(less_than_subnorm)), 0.0f); @@ -266,21 +272,18 @@ TEST(BF6, TestAsType16x1) vector_type right_vec; // check default CTOR ck::static_for<0, packed_size, 1>{}([&](auto i) { - ASSERT_EQ( - right_vec.template AsType()(Number<0>{}).template unpack<>(Number{}), - 0); + ASSERT_EQ(right_vec.template AsType()(Number<0>{}).unpack(i), 0); }); // assign test values to the vector ck::static_for<0, vector_size, 1>{}([&](auto i) { - right_vec.template AsType()(Number{}) = bf6x16_pk_t{}.pack(test_vec); + right_vec.template AsType()(Number{}) = bf6x16_pk_t{test_vec}; }); // copy the vector vector_type left_vec{right_vec}; // check if values were copied correctly ck::static_for<0, packed_size, 1>{}([&](auto i) { - ASSERT_EQ( - left_vec.template AsType()(Number<0>{}).template unpack<>(Number{}), - static_cast(test_vec[static_cast(i)])); + ASSERT_EQ(left_vec.template AsType()(Number<0>{}).unpack(i), + static_cast(test_vec[static_cast(i)])); }); } @@ -329,23 +332,23 @@ TEST(BF6, TestAsType16x2) // check default CTOR ck::static_for<0, vector_size, 1>{}([&](auto idx_vector) { ck::static_for<0, packed_size, 1>{}([&](auto idx_element) { - ASSERT_EQ(right_vec.template AsType()(Number{}) - .template unpack<>(Number{}), - 0); + ASSERT_EQ( + right_vec.template AsType()(Number{}).unpack(idx_element), + 0); }); }); // assign test values to the vector ck::static_for<0, vector_size, 1>{}([&](auto i) { - right_vec.template AsType()(Number{}) = bf6x16_pk_t{}.pack(test_vec[i]); + right_vec.template AsType()(Number{}) = bf6x16_pk_t{test_vec[i]}; }); // copy the vector vector_type left_vec{right_vec}; // check if values were copied correctly ck::static_for<0, vector_size, 1>{}([&](auto idx_vector) { ck::static_for<0, packed_size, 1>{}([&](auto idx_element) { - ASSERT_EQ(left_vec.template AsType()(Number{}) - .template unpack<>(Number{}), - static_cast(test_vec[idx_vector][static_cast(idx_element)])); + 
ASSERT_EQ( + left_vec.template AsType()(Number{}).unpack(idx_element), + static_cast(test_vec[idx_vector][static_cast(idx_element)])); }); }); } @@ -369,20 +372,86 @@ TEST(BF6, TestAsType32x1) vector_type right_vec; // check default CTOR ck::static_for<0, packed_size, 1>{}([&](auto i) { - ASSERT_EQ( - right_vec.template AsType()(Number<0>{}).template unpack<>(Number{}), - 0); + ASSERT_EQ(right_vec.template AsType()(Number<0>{}).unpack(i), 0); }); // assign test values to the vector ck::static_for<0, vector_size, 1>{}([&](auto i) { - right_vec.template AsType()(Number{}) = bf6x32_pk_t{}.pack(test_vec); + right_vec.template AsType()(Number{}) = bf6x32_pk_t{test_vec}; }); // copy the vector vector_type left_vec{right_vec}; // check if values were copied correctly ck::static_for<0, packed_size, 1>{}([&](auto i) { - ASSERT_EQ( - left_vec.template AsType()(Number<0>{}).template unpack<>(Number{}), - static_cast(test_vec[static_cast(i)])); + ASSERT_EQ(left_vec.template AsType()(Number<0>{}).unpack(i), + static_cast(test_vec[static_cast(i)])); + }); +} + +TEST(BF6, TestAllValues) +{ + + constexpr std::array e3m2ValuesOCP = { + // clang-format off + 0.0000000000, 0.0625000000, 0.1250000000, 0.1875000000, + 0.2500000000, 0.3125000000, 0.3750000000, 0.4375000000, + 0.5000000000, 0.6250000000, 0.7500000000, 0.8750000000, + 1.0000000000, 1.2500000000, 1.5000000000, 1.7500000000, + 2.0000000000, 2.5000000000, 3.0000000000, 3.5000000000, + 4.0000000000, 5.0000000000, 6.0000000000, 7.0000000000, + 8.0000000000, 10.0000000000, 12.0000000000, 14.0000000000, + 16.0000000000, 20.0000000000, 24.0000000000, 28.0000000000, + -0.0000000000, -0.0625000000, -0.1250000000, -0.1875000000, + -0.2500000000, -0.3125000000, -0.3750000000, -0.4375000000, + -0.5000000000, -0.6250000000, -0.7500000000, -0.8750000000, + -1.0000000000, -1.2500000000, -1.5000000000, -1.7500000000, + -2.0000000000, -2.5000000000, -3.0000000000, -3.5000000000, + -4.0000000000, -5.0000000000, -6.0000000000, -7.0000000000, 
+ -8.0000000000, -10.0000000000, -12.0000000000, -14.0000000000, + -16.0000000000, -20.0000000000, -24.0000000000, -28.0000000000 + // clang-format on + }; + + constexpr uint8_t e3m2BitsOCP[] = { + // clang-format off + 0b000000, 0b000001, 0b000010, 0b000011, + 0b000100, 0b000101, 0b000110, 0b000111, + 0b001000, 0b001001, 0b001010, 0b001011, + 0b001100, 0b001101, 0b001110, 0b001111, + 0b010000, 0b010001, 0b010010, 0b010011, + 0b010100, 0b010101, 0b010110, 0b010111, + 0b011000, 0b011001, 0b011010, 0b011011, + 0b011100, 0b011101, 0b011110, 0b011111, + 0b100000, 0b100001, 0b100010, 0b100011, + 0b100100, 0b100101, 0b100110, 0b100111, + 0b101000, 0b101001, 0b101010, 0b101011, + 0b101100, 0b101101, 0b101110, 0b101111, + 0b110000, 0b110001, 0b110010, 0b110011, + 0b110100, 0b110101, 0b110110, 0b110111, + 0b111000, 0b111001, 0b111010, 0b111011, + 0b111100, 0b111101, 0b111110, 0b111111 + // clang-format on + }; + + const bool ck_logging = ck::EnvIsEnabled(CK_ENV(CK_LOGGING)); + + if(ck_logging) + printf("BF6 Table\n"); + ck::static_for<0, 64, 1>{}([&](auto i) { + float fp = type_convert(bf6_t(e3m2BitsOCP[i])); + ASSERT_EQ(fp, e3m2ValuesOCP[i]); + + bf6_t bf6 = type_convert(e3m2ValuesOCP[i]); + ASSERT_EQ(bf6 & 0x3F, e3m2BitsOCP[i] & 0x3F); + + if(ck_logging) + { + // Print the binary representation + printf("Bits: 0b"); + for(int j = 5; j >= 0; --j) + { + printf("%c", (e3m2BitsOCP[i] & (1 << j)) ? 
'1' : '0'); + } + printf(", 0x%02X, Value: %f\n", e3m2BitsOCP[i], e3m2ValuesOCP[i]); + } }); } diff --git a/test/data_type/test_fp4.cpp b/test/data_type/test_fp4.cpp index f4b2bf3358..3fc74a2ef3 100644 --- a/test/data_type/test_fp4.cpp +++ b/test/data_type/test_fp4.cpp @@ -5,6 +5,7 @@ #include "ck/utility/data_type.hpp" #include "ck/utility/type_convert.hpp" #include "ck/utility/scaled_type_convert.hpp" +#include "ck/utility/env.hpp" using ck::e8m0_bexp_t; using ck::f4_convert_rne; @@ -38,6 +39,11 @@ TEST(FP4, ConvertFP32Nearest) // convert maximal float to fp4 and back, check if clipped to 6.0 ASSERT_NEAR( max_fp4, type_convert(f4_convert_rne(std::numeric_limits::max())), abs_tol); + + // convert +/-7.0 to fp4 and back, check if clipped to +/-6.0 + ASSERT_NEAR(-max_fp4, type_convert(f4_convert_rne(-7.0f)), 0.0); + ASSERT_NEAR(max_fp4, type_convert(f4_convert_rne(7.0f)), 0.0); + // positive norm float value to fp4 and back, check if holds float pos_float = 1.0f; ASSERT_NEAR(pos_float, type_convert(f4_convert_rne(pos_float)), abs_tol); @@ -468,3 +474,54 @@ TEST(FP4, TestAsType32) test_vec.at(i + 1)); }); } + +TEST(FP4, TestAllValues) +{ + constexpr std::array e2m1ValuesOCP = { + // clang-format off + 0.0000000000, 0.5000000000, + 1.0000000000, 1.5000000000, + 2.0000000000, 3.0000000000, + 4.0000000000, 6.0000000000, + -0.0000000000, -0.5000000000, + -1.0000000000, -1.5000000000, + -2.0000000000, -3.0000000000, + -4.0000000000, -6.0000000000 + // clang-format on + }; + + constexpr uint8_t e2m1BitsOCP[] = { + // clang-format off + 0b0000, 0b0001, + 0b0010, 0b0011, + 0b0100, 0b0101, + 0b0110, 0b0111, + 0b1000, 0b1001, + 0b1010, 0b1011, + 0b1100, 0b1101, + 0b1110, 0b1111 + // clang-format on + }; + + const bool ck_logging = ck::EnvIsEnabled(CK_ENV(CK_LOGGING)); + + if(ck_logging) + printf("FP4 Table\n"); + ck::static_for<0, 16, 1>{}([&](auto i) { + float fp = type_convert(f4_t(e2m1BitsOCP[i])); + ASSERT_EQ(fp, e2m1ValuesOCP[i]); + + f4_t fp4 = 
type_convert(e2m1ValuesOCP[i]); + ASSERT_EQ(fp4 & 0xF, e2m1BitsOCP[i] & 0xF); + if(ck_logging) + { + // Print the binary representation + printf("Bits: 0b"); + for(int j = 3; j >= 0; --j) + { + printf("%c", (e2m1BitsOCP[i] & (1 << j)) ? '1' : '0'); + } + printf(", 0x%02X, Value: %f\n", e2m1BitsOCP[i], e2m1ValuesOCP[i]); + } + }); +} diff --git a/test/data_type/test_fp6.cpp b/test/data_type/test_fp6.cpp index cf91e69db3..6d4aec1d9a 100644 --- a/test/data_type/test_fp6.cpp +++ b/test/data_type/test_fp6.cpp @@ -4,6 +4,7 @@ #include "gtest/gtest.h" #include "ck/utility/data_type.hpp" #include "ck/utility/type_convert.hpp" +#include "ck/utility/env.hpp" #include "ck/utility/scaled_type_convert.hpp" using ck::e8m0_bexp_t; @@ -34,6 +35,11 @@ TEST(FP6, ConvertFP32Nearest) ASSERT_NEAR(0.0f, type_convert(f6_convert_rne(0.0f)), 0.0f); // convert maximal f6_t to float and check if equal to max_fp6 ASSERT_NEAR(max_fp6, type_convert(f6_convert_rne(max_fp6)), 0.0f); + + // convert maximal +/-8.0 to fp6 and check if equal to +/-max_fp6 + ASSERT_NEAR(-max_fp6, type_convert(f6_convert_rne(-8.0f)), 0.0f); + ASSERT_NEAR(max_fp6, type_convert(f6_convert_rne(8.0f)), 0.0f); + // convert maximal float to fp6 and back, check if clipped to max_fp6 ASSERT_NEAR( max_fp6, type_convert(f6_convert_rne(std::numeric_limits::max())), 0.0f); @@ -265,20 +271,24 @@ TEST(FP6, TestAsType16x1) vector_type right_vec; // check default CTOR ck::static_for<0, packed_size, 1>{}([&](auto i) { - ASSERT_EQ( - right_vec.template AsType()(Number<0>{}).template unpack<>(Number{}), 0); + ASSERT_EQ(right_vec.template AsType()(Number<0>{}).unpack(i), 0); }); // assign test values to the vector ck::static_for<0, vector_size, 1>{}([&](auto i) { - right_vec.template AsType()(Number{}) = f6x16_pk_t{}.pack(test_vec); + right_vec.template AsType()(Number{}) = f6x16_pk_t{test_vec}; }); + // copy the vector vector_type left_vec{right_vec}; // check if values were copied correctly ck::static_for<0, packed_size, 1>{}([&](auto 
i) { - ASSERT_EQ( - left_vec.template AsType()(Number<0>{}).template unpack<>(Number{}), - static_cast(test_vec[static_cast(i)])); + ASSERT_EQ(left_vec.template AsType()(Number<0>{}).unpack(i), + static_cast(test_vec[static_cast(i)])) + << " i = " << i << "; left = " + << type_convert(left_vec.template AsType()(Number<0>{}).unpack(i)) + << " -- right = " + << type_convert(static_cast(test_vec[static_cast(i)])) << " (" + << static_cast(test_vec[static_cast(i)]) << ")" << std::endl; }); } @@ -327,23 +337,23 @@ TEST(FP6, TestAsType16x2) // check default CTOR ck::static_for<0, vector_size, 1>{}([&](auto idx_vector) { ck::static_for<0, packed_size, 1>{}([&](auto idx_element) { - ASSERT_EQ(right_vec.template AsType()(Number{}) - .template unpack<>(Number{}), - 0); + ASSERT_EQ( + right_vec.template AsType()(Number{}).unpack(idx_element), + 0); }); }); // assign test values to the vector ck::static_for<0, vector_size, 1>{}([&](auto i) { - right_vec.template AsType()(Number{}) = f6x16_pk_t{}.pack(test_vec[i]); + right_vec.template AsType()(Number{}) = f6x16_pk_t{test_vec[i]}; }); // copy the vector vector_type left_vec{right_vec}; // check if values were copied correctly ck::static_for<0, vector_size, 1>{}([&](auto idx_vector) { ck::static_for<0, packed_size, 1>{}([&](auto idx_element) { - ASSERT_EQ(left_vec.template AsType()(Number{}) - .template unpack<>(Number{}), - static_cast(test_vec[idx_vector][static_cast(idx_element)])); + ASSERT_EQ( + left_vec.template AsType()(Number{}).unpack(idx_element), + static_cast(test_vec[idx_vector][static_cast(idx_element)])); }); }); } @@ -367,19 +377,77 @@ TEST(FP6, TestAsType32x1) vector_type right_vec; // check default CTOR ck::static_for<0, packed_size, 1>{}([&](auto i) { - ASSERT_EQ( - right_vec.template AsType()(Number<0>{}).template unpack<>(Number{}), 0); + ASSERT_EQ(right_vec.template AsType()(Number<0>{}).unpack(i), 0); }); // assign test values to the vector ck::static_for<0, vector_size, 1>{}([&](auto i) { - 
right_vec.template AsType()(Number{}) = f6x32_pk_t{}.pack(test_vec); + right_vec.template AsType()(Number{}) = f6x32_pk_t{test_vec}; }); // copy the vector vector_type left_vec{right_vec}; // check if values were copied correctly ck::static_for<0, packed_size, 1>{}([&](auto i) { - ASSERT_EQ( - left_vec.template AsType()(Number<0>{}).template unpack<>(Number{}), - static_cast(test_vec[static_cast(i)])); + ASSERT_EQ(left_vec.template AsType()(Number<0>{}).unpack(i), + static_cast(test_vec[static_cast(i)])); + }); +} + +TEST(FP6, TestAllValues) +{ + constexpr std::array e2m3ValuesOCP = { + // clang-format off + 0.0000000000, 0.1250000000, 0.2500000000, 0.3750000000, 0.5000000000, 0.6250000000, 0.7500000000, 0.8750000000, + 1.0000000000, 1.1250000000, 1.2500000000, 1.3750000000, 1.5000000000, 1.6250000000, 1.7500000000, 1.8750000000, + 2.0000000000, 2.2500000000, 2.5000000000, 2.7500000000, 3.0000000000, 3.2500000000, 3.5000000000, 3.7500000000, + 4.0000000000, 4.5000000000, 5.0000000000, 5.5000000000, 6.0000000000, 6.5000000000, 7.0000000000, 7.5000000000, + -0.0000000000, -0.1250000000, -0.2500000000, -0.3750000000, -0.5000000000, -0.6250000000, -0.7500000000, -0.8750000000, + -1.0000000000, -1.1250000000, -1.2500000000, -1.3750000000, -1.5000000000, -1.6250000000, -1.7500000000, -1.8750000000, + -2.0000000000, -2.2500000000, -2.5000000000, -2.7500000000, -3.0000000000, -3.2500000000, -3.5000000000, -3.7500000000, + -4.0000000000, -4.5000000000, -5.0000000000, -5.5000000000, -6.0000000000, -6.5000000000, -7.0000000000, -7.5000000000 + // clang-format on + }; + + constexpr uint8_t e2m3BitsOCP[] = { + // clang-format off + 0b000000, 0b000001, 0b000010, 0b000011, + 0b000100, 0b000101, 0b000110, 0b000111, + 0b001000, 0b001001, 0b001010, 0b001011, + 0b001100, 0b001101, 0b001110, 0b001111, + 0b010000, 0b010001, 0b010010, 0b010011, + 0b010100, 0b010101, 0b010110, 0b010111, + 0b011000, 0b011001, 0b011010, 0b011011, + 0b011100, 0b011101, 0b011110, 0b011111, + 0b100000, 
0b100001, 0b100010, 0b100011, + 0b100100, 0b100101, 0b100110, 0b100111, + 0b101000, 0b101001, 0b101010, 0b101011, + 0b101100, 0b101101, 0b101110, 0b101111, + 0b110000, 0b110001, 0b110010, 0b110011, + 0b110100, 0b110101, 0b110110, 0b110111, + 0b111000, 0b111001, 0b111010, 0b111011, + 0b111100, 0b111101, 0b111110, 0b111111 + // clang-format on + }; + + const bool ck_logging = ck::EnvIsEnabled(CK_ENV(CK_LOGGING)); + + if(ck_logging) + printf("FP6 Table\n"); + ck::static_for<0, 64, 1>{}([&](auto i) { + float fp = type_convert(f6_t(e2m3BitsOCP[i])); + ASSERT_EQ(fp, e2m3ValuesOCP[i]); + + f6_t fp6 = type_convert(e2m3ValuesOCP[i]); + ASSERT_EQ(fp6 & 0x3F, e2m3BitsOCP[i] & 0x3F); + + if(ck_logging) + { + // Print the binary representation + printf("Bits: 0b"); + for(int j = 5; j >= 0; --j) + { + printf("%c", (e2m3BitsOCP[i] & (1 << j)) ? '1' : '0'); + } + printf(", 0x%02X, Value: %f\n", e2m3BitsOCP[i], e2m3ValuesOCP[i]); + } }); } diff --git a/test/mx_mfma_op/mx_mfma_op.cpp b/test/mx_mfma_op/mx_mfma_op.cpp index fddb8288a6..5e2aedd35e 100644 --- a/test/mx_mfma_op/mx_mfma_op.cpp +++ b/test/mx_mfma_op/mx_mfma_op.cpp @@ -5,9 +5,12 @@ #include "mx_mfma_op.hpp" +using ck::bf6_t; +using ck::bf8_t; using ck::e8m0_bexp_t; using ck::f4_t; using ck::f4x2_pk_t; +using ck::f6_t; using ck::f8_t; using ck::half_t; using ck::type_convert; @@ -17,13 +20,15 @@ using ck::type_convert; * * @param init - selects initialization algorithm for A and B tensors */ -template -bool run_mfma_km_kn_nm_test(ck::index_t init) +template +bool run_mfma_test(ck::index_t init) { - using ALayout = ck::tensor_layout::gemm::ColumnMajor; - using BLayout = ck::tensor_layout::gemm::ColumnMajor; - using CLayout = ck::tensor_layout::gemm::ColumnMajor; - using AccType = float; // only MFMA_F32 instructions supported using CPUAccType = AccType; @@ -53,74 +58,153 @@ bool run_mfma_km_kn_nm_test(ck::index_t init) return pass; } +const ck::index_t common_init = -4; // set to "< 0" for test-specific initializations + 
TEST(MFMA, FP8MFMA16x16x128) { - auto AB_init = 5; - auto pass = run_mfma_km_kn_nm_test(AB_init); + using ALayout = ck::tensor_layout::gemm::ColumnMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::ColumnMajor; + auto AB_init = (common_init < 0) ? 5 : common_init; + auto pass = run_mfma_test(AB_init); EXPECT_TRUE(pass); } -TEST(MFMA, FP8MFMA32x32x64) +TEST(MFMA, BF8MFMA16x16x128) { - auto AB_init = 5; - auto pass = run_mfma_km_kn_nm_test(AB_init); + using ALayout = ck::tensor_layout::gemm::ColumnMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::ColumnMajor; + auto AB_init = (common_init < 0) ? 5 : common_init; + auto pass = run_mfma_test(AB_init); EXPECT_TRUE(pass); } -/** - * @brief Run the test for the given MFMA instruction - * - * @param init - selects initialization algorithm for A and B tensors - */ -template -bool run_mfma_mk_kn_mn_test(ck::index_t init) +TEST(MFMA, FP4MFMA16x16x128) { using ALayout = ck::tensor_layout::gemm::RowMajor; using BLayout = ck::tensor_layout::gemm::ColumnMajor; using CLayout = ck::tensor_layout::gemm::RowMajor; - using AccType = float; // only MFMA_F32 instructions supported - using CPUAccType = AccType; - - ck::mfma_type(mfma)> mfma_instr; - constexpr auto BLOCK_M = mfma_instr.m_per_blk; - constexpr auto BLOCK_N = mfma_instr.n_per_blk; - constexpr auto BLOCK_K = mfma_instr.num_input_blks * mfma_instr.k_per_blk; - - const auto mfma_kernel = ck:: - matmul; - - bool pass = true; - - pass = ck::mfma_test::TestMFMA{}(mfma_kernel, init); - - return pass; + auto AB_init = (common_init < 0) ? 
5 : common_init; + auto pass = + run_mfma_test( + AB_init); + EXPECT_TRUE(pass); } -TEST(MFMA, FP4MFMA16x16x128) +TEST(MFMA, FP6MFMA16x16x128) { - auto AB_init = 4; - auto pass = run_mfma_mk_kn_mn_test( - AB_init); + using ALayout = ck::tensor_layout::gemm::RowMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::RowMajor; + + auto AB_init = (common_init < 0) ? 5 : common_init; + auto pass = + run_mfma_test( + AB_init); + EXPECT_TRUE(pass); +} + +TEST(MFMA, BF6MFMA16x16x128) +{ + using ALayout = ck::tensor_layout::gemm::RowMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::RowMajor; + + auto AB_init = (common_init < 0) ? 5 : common_init; + auto pass = run_mfma_test(AB_init); + EXPECT_TRUE(pass); +} + +TEST(MFMA, FP8MFMA32x32x64) +{ + using ALayout = ck::tensor_layout::gemm::ColumnMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::ColumnMajor; + + auto AB_init = (common_init < 0) ? 5 : common_init; + auto pass = + run_mfma_test( + AB_init); + EXPECT_TRUE(pass); +} + +TEST(MFMA, BF8MFMA32x32x64) +{ + using ALayout = ck::tensor_layout::gemm::ColumnMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::ColumnMajor; + + auto AB_init = (common_init < 0) ? 5 : common_init; + auto pass = run_mfma_test(AB_init); EXPECT_TRUE(pass); } TEST(MFMA, FP4MFMA32x32x64) { - auto AB_init = 4; - auto pass = run_mfma_mk_kn_mn_test( - AB_init); + using ALayout = ck::tensor_layout::gemm::RowMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::RowMajor; + + auto AB_init = (common_init < 0) ? 
5 : common_init; + auto pass = + run_mfma_test( + AB_init); + EXPECT_TRUE(pass); +} + +TEST(MFMA, FP6MFMA32x32x64) +{ + using ALayout = ck::tensor_layout::gemm::RowMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::RowMajor; + + auto AB_init = (common_init < 0) ? 5 : common_init; + auto pass = + run_mfma_test( + AB_init); + EXPECT_TRUE(pass); +} + +TEST(MFMA, BF6MFMA32x32x64) +{ + using ALayout = ck::tensor_layout::gemm::RowMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::RowMajor; + + auto AB_init = (common_init < 0) ? 5 : common_init; + auto pass = run_mfma_test(AB_init); EXPECT_TRUE(pass); } @@ -129,15 +213,18 @@ TEST(MFMA, FP4MFMA32x32x64) * * @param init - selects initialization algorithm for A and B tensors */ -template -bool run_mxmfma_mk_kn_mn_test(ck::index_t init) +template +bool run_mxmfma_test(ck::index_t init) { static_assert(mfma == ck::MFMA_F8F6F4::SCALE_F32_16x16x128 || mfma == ck::MFMA_F8F6F4::SCALE_F32_32x32x64, "Only SCALE_F32_16x16x128 and SCALE_F32_32x32x64 are supported"); - using ALayout = ck::tensor_layout::gemm::RowMajor; - using BLayout = ck::tensor_layout::gemm::ColumnMajor; - using CLayout = ck::tensor_layout::gemm::RowMajor; using AccType = float; // only MFMA_F32 instructions supported using ScaleType = ck::e8m0_bexp_t; // biased exponent type @@ -181,34 +268,170 @@ bool run_mxmfma_mk_kn_mn_test(ck::index_t init) TEST(MXMFMA, MXFP8MFMA16x16x128) { - auto AB_init = 5; - auto pass = - run_mxmfma_mk_kn_mn_test(AB_init); + using ALayout = ck::tensor_layout::gemm::RowMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::RowMajor; + + auto AB_init = (common_init < 0) ? 
5 : common_init; + auto pass = run_mxmfma_test(AB_init); EXPECT_TRUE(pass); } TEST(MXMFMA, MXFP8MFMA32x32x64) { - auto AB_init = 5; - auto pass = - run_mxmfma_mk_kn_mn_test(AB_init); + using ALayout = ck::tensor_layout::gemm::RowMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::RowMajor; + + auto AB_init = (common_init < 0) ? 5 : common_init; + auto pass = run_mxmfma_test(AB_init); + EXPECT_TRUE(pass); +} + +TEST(MXMFMA, MXBF8MFMA16x16x128) +{ + using ALayout = ck::tensor_layout::gemm::RowMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::RowMajor; + + auto AB_init = (common_init < 0) ? 5 : common_init; + auto pass = run_mxmfma_test(AB_init); + EXPECT_TRUE(pass); +} + +TEST(MXMFMA, MXBF8MFMA32x32x64) +{ + using ALayout = ck::tensor_layout::gemm::RowMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::RowMajor; + + auto AB_init = (common_init < 0) ? 5 : common_init; + auto pass = run_mxmfma_test(AB_init); + EXPECT_TRUE(pass); +} + +TEST(MXMFMA, MXFP6MFMA16x16x128) +{ + using ALayout = ck::tensor_layout::gemm::RowMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::RowMajor; + + auto AB_init = (common_init < 0) ? 5 : common_init; + auto pass = run_mxmfma_test(AB_init); + EXPECT_TRUE(pass); +} + +TEST(MXMFMA, MXFP6MFMA32x32x64) +{ + using ALayout = ck::tensor_layout::gemm::RowMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::RowMajor; + + auto AB_init = (common_init < 0) ? 5 : common_init; + auto pass = run_mxmfma_test(AB_init); + EXPECT_TRUE(pass); +} + +TEST(MXMFMA, MXBF6MFMA16x16x128) +{ + using ALayout = ck::tensor_layout::gemm::RowMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::RowMajor; + + auto AB_init = (common_init < 0) ? 
5 : common_init; + auto pass = run_mxmfma_test(AB_init); + EXPECT_TRUE(pass); +} + +TEST(MXMFMA, MXBF6MFMA32x32x64) +{ + using ALayout = ck::tensor_layout::gemm::RowMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::RowMajor; + + auto AB_init = (common_init < 0) ? 5 : common_init; + auto pass = run_mxmfma_test(AB_init); EXPECT_TRUE(pass); } TEST(MXMFMA, MXFP4MFMA16x16x128) { - auto AB_init = 4; - auto pass = - run_mxmfma_mk_kn_mn_test( - AB_init); + using ALayout = ck::tensor_layout::gemm::RowMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::RowMajor; + + auto AB_init = (common_init < 0) ? 5 : common_init; + auto pass = run_mxmfma_test(AB_init); EXPECT_TRUE(pass); } TEST(MXMFMA, MXFP4MFMA32x32x64) { - auto AB_init = 4; - auto pass = - run_mxmfma_mk_kn_mn_test( - AB_init); + using ALayout = ck::tensor_layout::gemm::RowMajor; + using BLayout = ck::tensor_layout::gemm::ColumnMajor; + using CLayout = ck::tensor_layout::gemm::RowMajor; + + auto AB_init = (common_init < 0) ? 5 : common_init; + auto pass = run_mxmfma_test(AB_init); EXPECT_TRUE(pass); } diff --git a/test/mx_mfma_op/mx_mfma_op.hpp b/test/mx_mfma_op/mx_mfma_op.hpp index 9ce871cfb1..4cab411cb4 100644 --- a/test/mx_mfma_op/mx_mfma_op.hpp +++ b/test/mx_mfma_op/mx_mfma_op.hpp @@ -151,6 +151,8 @@ __device__ AFragT load_A_col_major(AType const* input_ptr) // Reg 7 [24:31] | K79 | K95 | K111 | K127 | v[31] || Reg 7 [24:31] | K47 | K63 | v[31] | // clang-format on + static_assert(!is_packed_type_v, "Packed type is not supported"); + static constexpr int32_t WAVE_SIZE = 64; // Here we want to load from rows of A in chunks of 16 elements each. 
@@ -270,12 +272,28 @@ __device__ AFragT load_A_row_major(AType const* input_ptr) // Reg 3 [8:15] | K26K27 | K58K59 | K90K91 | K122K123 | v[13] || Reg 3 [8:15] | K26K27 | K58K59 | v[13] | // Reg 3 [16:23] | K28K29 | K60K61 | K92K93 | K124K125 | v[14] || Reg 3 [16:23] | K28K29 | K60K61 | v[14] | // Reg 3 [24:31] | K30K31 | K62K63 | K94K95 | K126K127 | v[15] || Reg 3 [24:31] | K30K31 | K62K63 | v[15] | + + + // Register Mapping for 16x128 for FP6: || Register Mapping for 32x64 for FP6: + // Size | BLOCK_M | BLOCK_M | BLOCK_M | BLOCK_M | || Size | BLOCK_M | BLOCK_M | | + // M | 0 ... 15 | 0 ... 15 | 0 ... 15 | 0 ... 15 | Vector || M | 0 ... 31 | 0 ... 31 | Vector | + // Thread Id | 0 ... 15 | 16 ... 31 | 32 ... 47 | 48 ... 63 | Element || Thread Id | 0 ... 31 | 32 ... 63 | Element| + // Register Element |------------|-------------|------------|-------------|-----------|| Register Element |------------|-------------|--------| + // Reg 0-2 [0:95] | K = 0-15 | K = 32-47 | K = 64-79 | K = 96-111 | v[0] || Reg 0-2 [0:95] | K = 0-15 | K = 32-47 | v[0] | + // Reg 3-5 [0:95] | K = 16-31 | K = 48-63 | K = 80-95 | K = 112-127 | v[0] || Reg 3-5 [0:95] | K = 16-31 | K = 48-63 | v[0] | + // clang-format on static constexpr int32_t WAVE_SIZE = 64; + // FP8 chunk_size = 16, num_chunks = 2, packed_size = 1 + // FP4 chunk_size = 32, num_chunks = 1, packed_size = 2 + // FP6 chunk_size = 32, num_chunks = 1, packed_size = 32 + + constexpr index_t num_chunks = is_packed_type_v ? 1 : 2; + // Here we want to load from rows of A in chunks of 16 elements each. - static constexpr uint32_t chunk_size = 16; + constexpr uint32_t chunk_size = is_packed_type_v ? 32 : 16; // each chunk is separated by offset static constexpr uint32_t chunk_offset = chunk_size * WAVE_SIZE / BLOCK_M; @@ -283,43 +301,35 @@ __device__ AFragT load_A_row_major(AType const* input_ptr) // To start the loading process, let's visualize in 2D coords. // Each thread will load 32 elements. 
// We need to know where they start, and where the next elements are. - auto startCoord2D = - std::make_pair(threadIdx.x % BLOCK_M, // Row {0-31} | {0-15} - (threadIdx.x / BLOCK_M) * chunk_size); // Col {0, 16} | {0, 16, 32, 48} + // FP8/6/4 Row {0-31} | {0-15} + // FP8 Col {0, 16} | {0, 16, 32, 48} + // FP6/4 Col {0, 32} | {0, 32, 64, 96} + auto startCoord2D = std::make_pair(threadIdx.x % BLOCK_M, (threadIdx.x / BLOCK_M) * chunk_size); - // auto minorStepCoord2D = std::make_pair(0u, 1u); // read rows auto majorStepCoord2D = std::make_pair(0, chunk_offset); // read a chunk from a row // Flatten to 1D row_major offsets. auto row_major = [](auto const& coord, auto ld) { return coord.first * ld + coord.second; }; - // BLOCK_K is a stride in A matrix - auto startOffset = row_major( - startCoord2D, BLOCK_K / (ck::is_same_v, ck::f4x2_pk_t> ? 2 : 1)); - // auto kMinorOffset = row_major(minorStepCoord2D, BLOCK_K / - // (ck::is_same_v, ck::f4x2_pk_t> ? 2 : 1)); - auto kMajorOffset = - row_major(majorStepCoord2D, - BLOCK_K / (ck::is_same_v, ck::f4x2_pk_t> ? 2 : 1)); - - using ARawT = typename scalar_type::type; - using AScalarFragT = vector_type::type; - - constexpr index_t num_chunks = - (ck::is_same_v, ck::f4x2_pk_t> ? 
1 : 2); + using ARawT = typename scalar_type::type; + using AScalarChunkT = vector_type::vector_size / num_chunks>::type; union { AFragT frag; - AScalarFragT chunks[num_chunks]; + AScalarChunkT chunks[num_chunks]; } fragA{}; - const AScalarFragT* fragPtr; + const AScalarChunkT* fragPtr; + + // BLOCK_K is a stride in A matrix + auto startOffset = row_major(startCoord2D, BLOCK_K) / packed_size_v; + auto kMajorOffset = row_major(majorStepCoord2D, BLOCK_K) / packed_size_v; for(index_t chunk_idx = 0; chunk_idx < num_chunks; chunk_idx++) { - fragPtr = reinterpret_cast(input_ptr + startOffset + - chunk_idx * kMajorOffset); + fragPtr = reinterpret_cast(input_ptr + startOffset + + chunk_idx * kMajorOffset); fragA.chunks[chunk_idx] = *fragPtr; } @@ -488,12 +498,27 @@ __device__ BFragT load_B_col_major(BType const* input_ptr) // Reg 3 [8:15] | K26K27 | K58K59 | K90K91 | K122K123 | v[13] || Reg 3 [8:15] | K26K27 | K58K59 | v[13] | // Reg 3 [16:23] | K28K29 | K60K61 | K92K93 | K124K125 | v[14] || Reg 3 [16:23] | K28K29 | K60K61 | v[14] | // Reg 3 [24:31] | K30K31 | K62K63 | K94K95 | K126K127 | v[15] || Reg 3 [24:31] | K30K31 | K62K63 | v[15] | + + // Register Mapping for 16x128 for FP6: || Register Mapping for 32x64 for FP6: + // Size | BLOCK_N | BLOCK_N | BLOCK_N | BLOCK_N | || Size | BLOCK_N | BLOCK_N | | + // N | 0 ... 15 | 0 ... 15 | 0 ... 15 | 0 ... 15 | Vector || N | 0 ... 31 | 0 ... 31 | Vector | + // Thread Id | 0 ... 15 | 16 ... 31 | 32 ... 47 | 48 ... 63 | Element || Thread Id | 0 ... 31 | 32 ... 
63 | Element| + // Register Element |------------|-------------|------------|-------------|-----------|| Register Element |------------|-------------|--------| + // Reg 0-2 [0:95] | K = 0-15 | K = 32-47 | K = 64-79 | K = 96-111 | v[0] || Reg 0-2 [0:95] | K = 0-15 | K = 32-47 | v[0] | + // Reg 3-5 [0:95] | K = 16-31 | K = 48-63 | K = 80-95 | K = 112-127 | v[0] || Reg 3-5 [0:95] | K = 16-31 | K = 48-63 | v[0] | + // clang-format on static constexpr int32_t WAVE_SIZE = 64; + // FP8 chunk_size = 16, num_chunks = 2, packed_size = 1 + // FP4 chunk_size = 32, num_chunks = 1, packed_size = 2 + // FP6 chunk_size = 32, num_chunks = 1, packed_size = 32 + + constexpr index_t num_chunks = is_packed_type_v ? 1 : 2; + // Here we want to load from cols of B in chunks of 16 elements each. - static constexpr uint32_t chunk_size = 16; + constexpr uint32_t chunk_size = is_packed_type_v ? 32 : 16; // each chunk is separated by an offset static constexpr uint32_t chunk_offset = chunk_size * WAVE_SIZE / BLOCK_N; // 32 or 64 @@ -501,44 +526,36 @@ __device__ BFragT load_B_col_major(BType const* input_ptr) // To start the loading process, let's visualize in 2D coords. // Each thread will load 32 elements. // We need to know where they start, and where the next elements are. - auto startCoord2D = - std::make_pair((threadIdx.x / BLOCK_N) * chunk_size, // Row {0, 16} | {0, 16, 32, 48} - threadIdx.x % BLOCK_N); // Col {0-31} | {0-15} + // FP8/6/4 Col {0-31} | {0-15} + // FP8 Row {0, 16} | {0, 16, 32, 48} + // FP6/4 Row {0, 32} | {0, 32, 64, 96} + auto startCoord2D = std::make_pair((threadIdx.x / BLOCK_N) * chunk_size, threadIdx.x % BLOCK_N); // Flatten to 1D col_major offsets. 
auto col_major = [](auto const& coord, auto ld) { return coord.first + coord.second * ld; }; - // auto minorStepCoord2D = std::make_pair(1u, 0u); // read cols auto majorStepCoord2D = std::make_pair(chunk_offset, 0); // read a chunk from a col - // BLOCK_K is a stride in B matrix - auto startOffset = col_major( - startCoord2D, BLOCK_K / (ck::is_same_v, ck::f4x2_pk_t> ? 2 : 1)); - // auto kMinorOffset = col_major(minorStepCoord2D, BLOCK_K / - // (ck::is_same_v, ck::f4x2_pk_t> ? 2 : 1)); - auto kMajorOffset = - col_major(majorStepCoord2D, - BLOCK_K / (ck::is_same_v, ck::f4x2_pk_t> ? 2 : 1)); - - using BRawT = typename scalar_type::type; - using BScalarFragT = vector_type::type; - - constexpr index_t num_chunks = - (ck::is_same_v, ck::f4x2_pk_t> ? 1 : 2); + using BRawT = typename scalar_type::type; + using BScalarChunkT = vector_type::vector_size / num_chunks>::type; union { BFragT frag; - BScalarFragT chunks[num_chunks]; + BScalarChunkT chunks[num_chunks]; } fragB{}; - const BScalarFragT* fragPtr; + const BScalarChunkT* fragPtr; - for(index_t chunk = 0; chunk < num_chunks; chunk++) + // BLOCK_K is a stride in B matrix + auto startOffset = col_major(startCoord2D, BLOCK_K) / packed_size_v; + auto kMajorOffset = col_major(majorStepCoord2D, BLOCK_K) / packed_size_v; + + for(index_t chunk_idx = 0; chunk_idx < num_chunks; chunk_idx++) { - fragPtr = - reinterpret_cast(input_ptr + startOffset + chunk * kMajorOffset); - fragB.chunks[chunk] = *fragPtr; + fragPtr = reinterpret_cast(input_ptr + startOffset + + chunk_idx * kMajorOffset); + fragB.chunks[chunk_idx] = *fragPtr; } return fragB.frag; @@ -904,20 +921,22 @@ template -__global__ void matmul(const AType* a, const BType* b, CType* c) +__global__ void matmul(const typename packed_type::type* a, + const typename packed_type::type* b, + CType* c) { + using PackedAType = typename packed_type::type; + constexpr auto packed_size_a = packed_type::packed_size; + using PackedBType = typename packed_type::type; + constexpr auto 
packed_size_b = packed_type::packed_size; + constexpr int WAVE_SIZE = 64; assert(threadIdx.x < WAVE_SIZE); assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1); - using AFragT = - vector_type, ck::f4x2_pk_t> ? 2 : 1)>::type; - using BFragT = - vector_type, ck::f4x2_pk_t> ? 2 : 1)>::type; + using AFragT = vector_type::type; + using BFragT = vector_type::type; + using CFragT = vector_type::type; using AccumFragT = vector_type; using RawAccumFragT = vector_type::type; @@ -931,11 +950,11 @@ __global__ void matmul(const AType* a, const BType* b, CType* c) // Load the inputs. if constexpr(is_same_v) { - fragA = load_A_row_major(a); + fragA = load_A_row_major(a); } else { - fragA = load_A_col_major(a); + fragA = load_A_col_major(a); } if constexpr(is_same_v) @@ -944,7 +963,7 @@ __global__ void matmul(const AType* a, const BType* b, CType* c) } else { - fragB = load_B_col_major(b); + fragB = load_B_col_major(b); } // Matrix multiply-accumulate using MFMA units @@ -979,21 +998,24 @@ template -__global__ void -matmul(const AType* a, const ScaleType* xa, const BType* b, const ScaleType* xb, CType* c) +__global__ void matmul(const packed_type_t* a, + const ScaleType* xa, + const packed_type_t* b, + const ScaleType* xb, + CType* c) { + using PackedAType = packed_type_t; + constexpr auto packed_size_a = packed_size_v; + using PackedBType = packed_type_t; + constexpr auto packed_size_b = packed_size_v; + constexpr int WAVE_SIZE = 64; assert(threadIdx.x < WAVE_SIZE); assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1); - using AFragT = - vector_type, ck::f4x2_pk_t> ? 2 : 1)>::type; - using BFragT = - vector_type, ck::f4x2_pk_t> ? 2 : 1)>::type; + using AFragT = vector_type::type; + using BFragT = vector_type::type; + using CFragT = vector_type::type; using AccumFragT = vector_type; using RawAccumFragT = vector_type::type; @@ -1011,9 +1033,13 @@ matmul(const AType* a, const ScaleType* xa, const BType* b, const ScaleType* xb, // Load the inputs. 
if constexpr(is_same_v) { - fragA = - load_mx_A_row_major( - a, xa, fragXa); + fragA = load_mx_A_row_major(a, xa, fragXa); } else { @@ -1026,9 +1052,13 @@ matmul(const AType* a, const ScaleType* xa, const BType* b, const ScaleType* xb, } else { - fragB = - load_mx_B_col_major( - b, xb, fragXb); + fragB = load_mx_B_col_major(b, xb, fragXb); } // Scaled Matrix multiply-accumulate using MFMA units @@ -1151,6 +1181,11 @@ template struct TestMXMFMA { + using PackedAType = typename packed_type::type; + static constexpr auto packed_size_a = packed_type::packed_size; + using PackedBType = typename packed_type::type; + static constexpr auto packed_size_b = packed_type::packed_size; + auto PrepareGemmTensors(const GemmParams& params, index_t init) { auto f_host_tensor_descriptor = @@ -1167,11 +1202,11 @@ struct TestMXMFMA } }; - Tensor a_m_k( + Tensor a_m_k( f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{})); Tensor a_scales( f_host_tensor_descriptor(params.M, params.K / BLOCK_X, params.K / BLOCK_X, ALayout{})); - Tensor b_n_k( + Tensor b_n_k( f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{})); Tensor b_scales( f_host_tensor_descriptor(params.K / BLOCK_X, params.N, params.K / BLOCK_X, BLayout{})); @@ -1183,51 +1218,44 @@ struct TestMXMFMA switch(init) { case 0: - a_m_k.GenerateTensorValue(GeneratorTensor_1{1.0f}); - a_scales.GenerateTensorValue(GeneratorTensor_1{ScaleType{0.015625f}}); // 1/6 + a_m_k.GenerateTensorValue(GeneratorTensor_1{1.0f}); + a_scales.GenerateTensorValue(GeneratorTensor_1{ScaleType{0.5f}}); // NOTE: not all numbers are representable in FP8, BF8, etc. 
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 16 18 20 20 20 22 24 24 24 26 28 28 28 30 32 - b_n_k.GenerateTensorValue(GeneratorTensor_Sequential{}); + b_n_k.GenerateTensorValue(GeneratorTensor_Sequential{}); b_scales.GenerateTensorValue(GeneratorTensor_1{ScaleType{1.0f}}); break; case 1: // results in C = {K} - a_m_k.GenerateTensorValue(GeneratorTensor_1{1.0f}); + a_m_k.GenerateTensorValue(GeneratorTensor_1{1.0f}); a_scales.GenerateTensorValue(GeneratorTensor_1{ScaleType{512.0f}}); - b_n_k.GenerateTensorValue(GeneratorTensor_1{1.0f}); + b_n_k.GenerateTensorValue(GeneratorTensor_1{1.0f}); b_scales.GenerateTensorValue(GeneratorTensor_1{ScaleType{1.0f / 512}}); break; case 2: // expect small round off errors - a_m_k.GenerateTensorValue(GeneratorTensor_3{-2.0, 2.0}); + a_m_k.GenerateTensorValue(GeneratorTensor_3{-2.0, 2.0}); a_scales.GenerateTensorValue( GeneratorTensor_2{126, 129}); // scales: {0.5, 1, 2} - b_n_k.GenerateTensorValue(GeneratorTensor_3{-2.0, 2.0}); + b_n_k.GenerateTensorValue(GeneratorTensor_3{-2.0, 2.0}); b_scales.GenerateTensorValue(GeneratorTensor_2{126, 129}); break; case 3: // expect small round off errors - a_m_k.GenerateTensorValue(GeneratorTensor_4(0, 1)); + a_m_k.GenerateTensorValue(GeneratorTensor_4(0, 1, time(nullptr))); a_scales.GenerateTensorValue( GeneratorTensor_2{126, 129}); // scales: {0.5, 1, 2} - b_n_k.GenerateTensorValue(GeneratorTensor_4(0, 1)); - b_scales.GenerateTensorValue( - GeneratorTensor_2{126, 129}); // scales: {0.5, 1, 2} - break; - case 4: - a_m_k.GenerateTensorValue(GeneratorTensor_3{-1., 1.}); - a_scales.GenerateTensorValue( - GeneratorTensor_2{126, 129}); // scales: {0.5, 1, 2} - b_n_k.GenerateTensorValue(GeneratorTensor_3{-1., 1.}); + b_n_k.GenerateTensorValue(GeneratorTensor_4(0, 1, time(nullptr) / 2)); b_scales.GenerateTensorValue( GeneratorTensor_2{126, 129}); // scales: {0.5, 1, 2} break; + default: // all initial values are representable in FP8, BF8 - a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 6}); // Z[-5,5] 
+ a_m_k.GenerateTensorValue(GeneratorTensor_2{-6, 7}); // Z[-6,6] a_scales.GenerateTensorValue( - GeneratorTensor_2{122, 129}); // scales: [1/32,..., 2] - b_n_k.GenerateTensorValue(GeneratorTensor_2{-5, 6}); // Z[-5,5] + GeneratorTensor_2{122, 129}); // scales: [1/32,..., 2] + b_n_k.GenerateTensorValue(GeneratorTensor_2{-6, 7}); // Z[-6,6] b_scales.GenerateTensorValue( GeneratorTensor_2{122, 129}); // scales: [1/32,..., 2] @@ -1272,9 +1300,9 @@ struct TestMXMFMA auto host_tensors = PrepareGemmTensors(params, init); - const Tensor& a = std::get<0>(host_tensors); + const Tensor& a = std::get<0>(host_tensors); const Tensor& a_scales = std::get<1>(host_tensors); - const Tensor& b = std::get<2>(host_tensors); + const Tensor& b = std::get<2>(host_tensors); const Tensor& b_scales = std::get<3>(host_tensors); Tensor& c_host = std::get<4>(host_tensors); Tensor& c_device = std::get<5>(host_tensors); @@ -1356,6 +1384,12 @@ template struct TestMFMA { + + using PackedAType = typename packed_type::type; + static constexpr auto packed_size_a = packed_type::packed_size; + using PackedBType = typename packed_type::type; + static constexpr auto packed_size_b = packed_type::packed_size; + auto PrepareGemmTensors(const GemmParams& params, index_t init) { auto f_host_tensor_descriptor = @@ -1372,9 +1406,9 @@ struct TestMFMA } }; - Tensor a_m_k( + Tensor a_m_k( f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{})); - Tensor b_n_k( + Tensor b_n_k( f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{})); Tensor c_m_n_host_result( f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); @@ -1384,34 +1418,30 @@ struct TestMFMA switch(init) { case 0: - a_m_k.GenerateTensorValue(GeneratorTensor_1{0.015625f}); + a_m_k.GenerateTensorValue(GeneratorTensor_1{0.625f}); // NOTE: not all numbers are representable in FP8, BF8, etc. 
- b_n_k.GenerateTensorValue(GeneratorTensor_Sequential{}); + b_n_k.GenerateTensorValue(GeneratorTensor_Sequential{}); break; case 1: // results in C = {K} - a_m_k.GenerateTensorValue(GeneratorTensor_1{1.0f}); - b_n_k.GenerateTensorValue(GeneratorTensor_1{1.0f}); + a_m_k.GenerateTensorValue(GeneratorTensor_1{1.0f}); + b_n_k.GenerateTensorValue(GeneratorTensor_1{1.0f}); break; case 2: - // expect small round off errors - a_m_k.GenerateTensorValue(GeneratorTensor_3{-5, 5}); - b_n_k.GenerateTensorValue(GeneratorTensor_3{-5, 5}); + // expect small round off errors that lead to FP8MFMA32x32x64 failures + a_m_k.GenerateTensorValue(GeneratorTensor_3{-5, 5}); + b_n_k.GenerateTensorValue(GeneratorTensor_3{-5, 5}); break; case 3: - // expect small round off errors - a_m_k.GenerateTensorValue(GeneratorTensor_4(-1, 3)); - b_n_k.GenerateTensorValue(GeneratorTensor_4(1, 3)); - break; - case 4: - // FP4 values case - a_m_k.GenerateTensorValue(GeneratorTensor_2{-4, 5}); - b_n_k.GenerateTensorValue(GeneratorTensor_2{-4, 5}); + // expect small round off errors that lead to FP8MFMA32x32x64 failures + a_m_k.GenerateTensorValue(GeneratorTensor_4(-1, 3)); + b_n_k.GenerateTensorValue(GeneratorTensor_4(1, 3)); break; + default: - // all initial values are representable in FP8, BF8 - a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 6}); - b_n_k.GenerateTensorValue(GeneratorTensor_2{-5, 6}); + // all initial values are representable in FP8/6, BF8/6 FP4 is missing 5 + a_m_k.GenerateTensorValue(GeneratorTensor_2{-6, 7}); // Z[-6,6] + b_n_k.GenerateTensorValue(GeneratorTensor_2{-6, 7}); break; } @@ -1453,10 +1483,10 @@ struct TestMFMA auto host_tensors = PrepareGemmTensors(params, init); - const Tensor& a = std::get<0>(host_tensors); - const Tensor& b = std::get<1>(host_tensors); - Tensor& c_host = std::get<2>(host_tensors); - Tensor& c_device = std::get<3>(host_tensors); + const Tensor& a = std::get<0>(host_tensors); + const Tensor& b = std::get<1>(host_tensors); + Tensor& c_host = 
std::get<2>(host_tensors); + Tensor& c_device = std::get<3>(host_tensors); using PassThrough = ck::tensor_operation::element_wise::PassThrough; @@ -1464,8 +1494,8 @@ struct TestMFMA auto b_element_op = PassThrough{}; auto c_element_op = PassThrough{}; - using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm Date: Mon, 19 May 2025 17:29:51 -0700 Subject: [PATCH 130/443] Use new mfma instructions for FP8 on gfx950 (#2202) * Add logic to use new mfma instructions for fp8 bf8 * Fix example_gemm_xdl_fp8_pk_i4_bpreshuffle_v3 on gfx950 and run clang format * Update include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp Co-authored-by: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com> * Fix intrin_mfma f8 calls due to merge mistake --------- Co-authored-by: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com> --- example/01_gemm/gemm_xdl_fp8.cpp | 2 + ...ipeline_xdlops_b_preshuffle_dequant_v3.hpp | 4 +- ...iple_d_welford_first_half_xdl_cshuffle.hpp | 16 ++- ...wise_batched_gemm_gemm_xdl_cshuffle_v1.hpp | 11 +- ...iple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp | 33 +++-- ...ultiple_d_softmax_gemm_xdl_cshuffle_v1.hpp | 19 ++- ...ched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp | 11 +- ...e_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp | 11 +- ...ridwise_gemm_multiple_abd_xdl_cshuffle.hpp | 20 +-- ...emm_multiple_d_multiple_r_xdl_cshuffle.hpp | 11 +- .../gridwise_gemm_multiple_d_xdl_cshuffle.hpp | 20 +-- ...ultiple_d_xdl_cshuffle_lds_direct_load.hpp | 19 +-- ...se_gemm_multiple_d_xdl_splitk_cshuffle.hpp | 16 ++- .../gridwise_gemm_reduce_xdl_cshuffle_v1.hpp | 11 +- ...e_gemm_split_k_multiple_d_xdl_cshuffle.hpp | 32 +++-- ...emm_split_k_multiple_d_xdl_cshuffle_v2.hpp | 16 ++- .../gridwise_gemm_xdl_cshuffle_conv_v3.hpp | 13 +- .../gridwise_gemm_xdl_cshuffle_streamk_v3.hpp | 13 +- .../grid/gridwise_gemm_xdl_cshuffle_v1.hpp | 16 ++- .../grid/gridwise_gemm_xdl_cshuffle_v2.hpp | 16 ++- .../grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 13 +- 
...wise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp | 20 ++- .../gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp | 13 +- ...ridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp | 13 +- .../gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp | 14 +- ..._gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp | 14 +- ...m_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp | 25 +++- ...ridwise_gemm_xdl_layernorm_cshuffle_v1.hpp | 11 +- ...ridwise_gemm_xdl_waveletmodel_cshuffle.hpp | 17 ++- .../grid/gridwise_gemm_xdlops_bwd_weight.hpp | 17 ++- .../gpu/grid/gridwise_gemm_xdlops_v3r1.hpp | 11 +- .../gpu/grid/gridwise_moe_gemm.hpp | 29 ++-- .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 131 +++++++++++++++--- include/ck/utility/amd_xdlops.hpp | 90 ++++++++++++ 34 files changed, 548 insertions(+), 180 deletions(-) diff --git a/example/01_gemm/gemm_xdl_fp8.cpp b/example/01_gemm/gemm_xdl_fp8.cpp index 3c75a44d21..0c51a58037 100644 --- a/example/01_gemm/gemm_xdl_fp8.cpp +++ b/example/01_gemm/gemm_xdl_fp8.cpp @@ -32,6 +32,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle // ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | | | // ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8, LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>; + // this instance has been tested working on gfx950 + // < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, 
GemmDefault, 1, 256, 256, 128, 128, 32, 32, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8, LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>; // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp index 4be4e321d3..e5fe92a50d 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp @@ -124,7 +124,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v3{}; + static constexpr index_t PrefetchStages = 2; static constexpr index_t PrefillStages = 1; static constexpr index_t GlobalBufferNum = 1; diff --git a/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp index d728360c55..02dba97430 100644 --- a/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gemm_layernorm/gridwise_gemm_multiple_d_welford_first_half_xdl_cshuffle.hpp @@ -519,13 +519,19 @@ struct GridwiseGemmMultipleDWelfordFirstHalf_xdl_cshuffle constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; - constexpr index_t KPack = - math::max(lcm_AK1_BK1, - MfmaSelector:: - selected_mfma.k_per_blk); + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max(lcm_AK1_BK1, + MfmaSelector::selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< BlockSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp index 50b4a734fa..258d0ad0ca 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp @@ -452,13 +452,16 @@ struct GridwiseBatchedGemmGemm_Xdl_CShuffle constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; - constexpr index_t KPack = math::max( + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max( lcm_AK1_BK1, - MfmaSelector::selected_mfma - .k_per_blk); + MfmaSelector:: + selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_v2< BlockSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp index 79a9410898..53a45c7f16 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp @@ -365,16 +365,20 @@ struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_A0K1_B0K1 <= 4) || - (is_same::value && lcm_A0K1_B0K1 <= 8)) + (is_same::value && lcm_A0K1_B0K1 <= 8) || + ((is_same::value || is_same::value) && + lcm_A0K1_B0K1 < 32)) ? true : false; - constexpr auto mfma = MfmaSelector::selected_mfma; - constexpr auto N3 = mfma.num_groups_per_blk; - constexpr auto N5 = mfma.group_size; + is_single_rate_mfma, + is_scale_mfma>::selected_mfma; + constexpr auto N3 = mfma.num_groups_per_blk; + constexpr auto N5 = mfma.group_size; return transform_tensor_descriptor( d0_grid_desc_m_n, make_tuple(make_unmerge_transform(make_tuple( @@ -657,16 +661,19 @@ struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_A0K1_B0K1 <= 4) || - (is_same::value && lcm_A0K1_B0K1 <= 8)) + (is_same::value && lcm_A0K1_B0K1 <= 8) || + ((is_same::value || is_same::value) && + lcm_A0K1_B0K1 < 32)) ? 
true : false; - constexpr index_t KPack = - math::max(lcm_A0K1_B0K1, - MfmaSelector::selected_mfma.k_per_blk); + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max(lcm_A0K1_B0K1, + MfmaSelector::selected_mfma.k_per_blk); auto blockwise_gemm0 = BlockwiseGemmXdlops_v2< BlockSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp index d15767f658..0f2085525f 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_softmax_gemm_xdl_cshuffle_v1.hpp @@ -347,11 +347,15 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? true : false; + constexpr auto is_scale_mfma = false; constexpr auto mfma = - MfmaSelector::selected_mfma; + MfmaSelector:: + selected_mfma; constexpr auto N3 = mfma.num_groups_per_blk; constexpr auto N4 = mfma.num_input_blks; constexpr auto N5 = mfma.group_size; @@ -564,13 +568,16 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; - constexpr index_t KPack = math::max( + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max( lcm_AK1_BK1, - MfmaSelector::selected_mfma - .k_per_blk); + MfmaSelector:: + selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_v2< BlockSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp index a11d696019..33b9199ea5 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp @@ -473,13 +473,16 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; - constexpr index_t KPack = math::max( + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max( lcm_AK1_BK1, - MfmaSelector::selected_mfma - .k_per_blk); + MfmaSelector:: + selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_v2< BlockSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp index ab97a940a8..f406bfb95a 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp @@ -502,13 +502,16 @@ struct GridwiseGemmBiasAddReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1 constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; - constexpr index_t KPack = math::max( + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max( lcm_AK1_BK1, - MfmaSelector::selected_mfma - .k_per_blk); + MfmaSelector:: + selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< BlockSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp index 79ab3acd92..054aca2936 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp @@ -679,17 +679,19 @@ struct GridwiseGemmMultipleABD_xdl_cshuffle (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? true : false; - - constexpr index_t KPack = - math::max(lcm_AK1_BK1, - MfmaSelector::selected_mfma.k_per_blk); + static constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max(lcm_AK1_BK1, + MfmaSelector::selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< BlockSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp index 0e51c6904c..127d889572 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp @@ -468,13 +468,16 @@ struct GridwiseGemmMultipleDMultipleR_k0mk1_k0nk1_mn_xdl_cshuffle_v1 constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value 
&& lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? true : false; - constexpr index_t KPack = math::max( + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max( lcm_AK1_BK1, - MfmaSelector::selected_mfma - .k_per_blk); + MfmaSelector:: + selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< BlockSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp index a3301dd932..be0fff087e 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp @@ -647,17 +647,19 @@ struct GridwiseGemmMultipleD_xdl_cshuffle (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; - - constexpr index_t KPack = - math::max(lcm_AK1_BK1, - MfmaSelector::selected_mfma.k_per_blk); + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max(lcm_AK1_BK1, + MfmaSelector::selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< BlockSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp index 57b9b02548..7781d1def3 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp @@ -605,17 +605,20 @@ struct GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; + constexpr auto is_scale_mfma = false; - constexpr index_t KPack = - math::max(lcm_AK1_BK1, - MfmaSelector::selected_mfma.k_per_blk); + constexpr index_t KPack = math::max(lcm_AK1_BK1, + MfmaSelector::selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< BlockSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp index 88d6be234c..5815eb5b0b 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp @@ -603,13 +603,19 @@ struct GridwiseGemmMultipleD_xdl_splitk_cshuffle constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; - constexpr index_t KPack = math::max( - lcm_AK1_BK1, - MfmaSelector:: - selected_mfma.k_per_blk); + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max(lcm_AK1_BK1, + MfmaSelector::selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< BlockSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp index 56581256dc..db227bb7ef 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp @@ -455,13 +455,16 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1 constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; - constexpr index_t KPack = math::max( + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max( lcm_AK1_BK1, - MfmaSelector::selected_mfma - .k_per_blk); + MfmaSelector:: + selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< BlockSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp index 23b4aec3b0..70301c326a 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp @@ -585,13 +585,19 @@ struct GridwiseGemmSplitKMultipleD_xdl_cshuffle constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? true : false; - constexpr index_t KPack = - math::max(lcm_AK1_BK1, - MfmaSelector:: - selected_mfma.k_per_blk); + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max(lcm_AK1_BK1, + MfmaSelector::selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< BlockSize, @@ -1018,13 +1024,19 @@ struct GridwiseGemmSplitKMultipleD_xdl_cshuffle constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; - constexpr index_t KPack = - math::max(lcm_AK1_BK1, - MfmaSelector:: - selected_mfma.k_per_blk); + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max(lcm_AK1_BK1, + MfmaSelector::selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< BlockSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle_v2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle_v2.hpp index 44c1e936bd..f64838ea4e 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle_v2.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle_v2.hpp @@ -599,13 +599,19 @@ struct GridwiseGemmMultipleD_xdl_splitk_cshuffle constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; - constexpr index_t KPack = math::max( - lcm_AK1_BK1, - MfmaSelector:: - selected_mfma.k_per_blk); + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max(lcm_AK1_BK1, + MfmaSelector::selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< BlockSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp index d37b3cd38e..4d3ae93659 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp @@ -83,13 +83,20 @@ struct GridwiseGemm_xdl_cshuffle_v3 static constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? true : false; + static constexpr auto is_scale_mfma = false; static constexpr index_t KPack = math::max(lcm_AK1_BK1, - MfmaSelector:: - selected_mfma.k_per_blk); + MfmaSelector::selected_mfma.k_per_blk); using ThisThreadBlock = ThisThreadBlock; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp index e5e32a8535..4e72255d31 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp @@ -144,13 +144,20 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 static constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; + static constexpr auto is_scale_mfma = false; static constexpr index_t KPack = math::max(lcm_AK1_BK1, - MfmaSelector:: - selected_mfma.k_per_blk); + MfmaSelector::selected_mfma.k_per_blk); using ThisThreadBlock = ThisThreadBlock; __host__ static auto CalculateMPadded(index_t M) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp index 240bc464e1..7edcd7270f 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp @@ -814,13 +814,19 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? true : false; - constexpr index_t KPack = math::max( - lcm_AK1_BK1, - MfmaSelector:: - selected_mfma.k_per_blk); + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max(lcm_AK1_BK1, + MfmaSelector::selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< BlockSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp index c7d44e842d..f92268265f 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp @@ -873,13 +873,19 @@ struct GridwiseGemm_xdl_cshuffle_v2 constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; - constexpr index_t KPack = math::max( - lcm_AK1_BK1, - MfmaSelector:: - selected_mfma.k_per_blk); + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max(lcm_AK1_BK1, + MfmaSelector::selected_mfma.k_per_blk); // auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< // BlockSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index 29150c0688..0dbbc2a5e9 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -255,13 +255,20 @@ struct GridwiseGemm_xdl_cshuffle_v3 static constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? true : false; + static constexpr auto is_scale_mfma = false; static constexpr index_t KPack = math::max(lcm_AK1_BK1, - MfmaSelector:: - selected_mfma.k_per_blk); + MfmaSelector::selected_mfma.k_per_blk); using ThisThreadBlock = ThisThreadBlock; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp index a22fc06a50..cfa8bfeb2a 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp @@ -148,13 +148,21 @@ struct GridwiseGemm_xdl_cshuffle_v3_b_preshuffle static constexpr auto AK1Number = Number{}; static constexpr auto BK1Number = Number{}; - using mfma_selector = MfmaSelector; + // Use singal rate mfma instruction for this special case A (f8_t) * B (pk_i4_t) + // See example gemm_xdl_fp8_pk_i4_bpreshuffle_v3 + // TODO: explore 
optimization opportunity by using new mfma instructions on gfx950 + static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number); + static constexpr bool is_single_rate_mfma = true; + static constexpr auto is_scale_mfma = false; + static constexpr auto mfma = MfmaSelector{}; + static constexpr index_t KPack = math::max(lcm_AK1_BK1, mfma.selected_mfma.k_per_blk); + static constexpr index_t KLane = mfma.GetKPerXdlops() / mfma.GetK1PerXdlops(); - static constexpr index_t KPack = - math::max(math::lcm(AK1Number, BK1Number), mfma_selector::selected_mfma.k_per_blk); - - static constexpr index_t KLane = - mfma_selector::GetKPerXdlops() / mfma_selector::GetK1PerXdlops(); static constexpr index_t KRepeat = KPerBlock / KLane / KPack; static constexpr index_t NLane = NPerXdl; static constexpr index_t NWave = NPerBlock / NPerXdl / NXdlPerWave; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp index 7124687d5d..93c1779a80 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp @@ -160,13 +160,20 @@ struct GridwiseGemm_xdl_cshuffle_v3 static constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; + static constexpr auto is_scale_mfma = false; static constexpr index_t KPack = math::max(lcm_AK1_BK1, - MfmaSelector:: - selected_mfma.k_per_blk); + MfmaSelector::selected_mfma.k_per_blk); using ThisThreadBlock = ThisThreadBlock; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp index ac3e821340..97d0e2a4eb 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp @@ -198,13 +198,20 @@ struct GridwiseGemm_xdl_cshuffle_v3 static constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? true : false; + static constexpr auto is_scale_mfma = false; static constexpr index_t KPack = math::max(lcm_AK1_BK1, - MfmaSelector:: - selected_mfma.k_per_blk); + MfmaSelector::selected_mfma.k_per_blk); using ThisThreadBlock = ThisThreadBlock; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp index 4163d1d01a..38ce9536ab 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp @@ -183,14 +183,20 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3 static constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; - + static constexpr auto is_scale_mfma = false; static constexpr index_t KPack = math::max(lcm_AK1_BK1, - MfmaSelector:: - selected_mfma.k_per_blk); + MfmaSelector::selected_mfma.k_per_blk); using ThisThreadBlock = ThisThreadBlock; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp index 21812380c2..ef84dd182a 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp @@ -153,14 +153,20 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3 static constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? true : false; - + static constexpr auto is_scale_mfma = false; static constexpr index_t KPack = math::max(lcm_AK1_BK1, - MfmaSelector:: - selected_mfma.k_per_blk); + MfmaSelector::selected_mfma.k_per_blk); using ThisThreadBlock = ThisThreadBlock; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp index c0d9464136..8fb955c561 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp @@ -164,12 +164,25 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle static constexpr index_t NumDTensor = DsDataType::Size(); - using mfma_selector = MfmaSelector; - static constexpr index_t KPack = - math::max(math::lcm(AK1Number, BK1Number), mfma_selector::selected_mfma.k_per_blk); - static constexpr index_t KGroup = 
mfma_selector::selected_mfma.k_per_blk == 32 ? 2 : 1; - static constexpr index_t KLane = - mfma_selector::GetKPerXdlops() / mfma_selector::GetK1PerXdlops(); + static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number); + static constexpr bool is_single_rate_mfma = + (((is_same::value || is_same::value) && + lcm_AK1_BK1 <= 4) || + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) + ? true + : false; + static constexpr auto is_scale_mfma = false; + static constexpr auto mfma = MfmaSelector{}; + static constexpr index_t KPack = math::max(lcm_AK1_BK1, mfma.selected_mfma.k_per_blk); + static constexpr index_t KGroup = mfma.selected_mfma.k_per_blk == 32 ? 2 : 1; + static constexpr index_t KLane = mfma.GetKPerXdlops() / mfma.GetK1PerXdlops(); static constexpr index_t KPackPerGroup = KPack / KGroup; static constexpr index_t KRepeat = KPerBlock / KLane / KPackPerGroup; static constexpr index_t NLane = NPerXdl; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp index b435fd5d5a..67fb4d651e 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp @@ -493,13 +493,16 @@ struct GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; - constexpr index_t KPack = math::max( + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max( lcm_AK1_BK1, - MfmaSelector::selected_mfma - .k_per_blk); + MfmaSelector:: + selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< BlockSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp index ad65e75ef9..50363d832e 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp @@ -491,13 +491,20 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_waveletmodel_cshuffle constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; - constexpr index_t KPack = math::max( - lcm_AK1_BK1, - MfmaSelector:: - selected_mfma.k_per_blk); + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = + math::max(lcm_AK1_BK1, + MfmaSelector::selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< TileMathThreadGroupSize, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp index 168c553180..b7947309e4 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp @@ -744,14 +744,19 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight constexpr bool is_single_rate_mfma = (((is_same::value || is_same::value) && K1 <= 4) || - (is_same::value && K1 <= 8)) + (is_same::value && K1 <= 8) || + ((is_same::value || is_same::value) && + K1 < 32)) ? true : false; - - constexpr index_t KPack = math::max( - K1, - MfmaSelector:: - selected_mfma.k_per_blk); + constexpr auto is_scale_mfma = false; + constexpr index_t KPack = math::max(K1, + MfmaSelector::selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1::value || is_same::value) && lcm_AK1_BK1 <= 4) || - (is_same::value && lcm_AK1_BK1 <= 8)) + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) ? 
true : false; - constexpr index_t k_pack = math::max( + constexpr auto is_scale_mfma = false; + constexpr index_t k_pack = math::max( lcm_AK1_BK1, - MfmaSelector::selected_mfma - .k_per_blk); + MfmaSelector:: + selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1; - static constexpr index_t KPack = - math::max(math::lcm(AK1Number, BK1Number), mfma_selector::selected_mfma.k_per_blk); - static constexpr index_t KLane = - mfma_selector::GetKPerXdlops() / mfma_selector::GetK1PerXdlops(); - static constexpr index_t KRepeat = KPerBlock / KLane / KPack; - static constexpr index_t NLane = NPerXdl; - static constexpr index_t NWave = NPerBlock / NPerXdl / NXdlPerWave; + static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number); + static constexpr bool is_single_rate_mfma = + (((is_same::value || is_same::value) && + lcm_AK1_BK1 <= 4) || + (is_same::value && lcm_AK1_BK1 <= 8) || + ((is_same::value || is_same::value) && + lcm_AK1_BK1 < 32)) + ? 
true + : false; + static constexpr auto is_scale_mfma = false; + static constexpr auto mfma = MfmaSelector{}; + static constexpr index_t KPack = math::max(lcm_AK1_BK1, mfma.selected_mfma.k_per_blk); + static constexpr index_t KLane = mfma.GetKPerXdlops() / mfma.GetK1PerXdlops(); + static constexpr index_t KRepeat = KPerBlock / KLane / KPack; + static constexpr index_t NLane = NPerXdl; + static constexpr index_t NWave = NPerBlock / NPerXdl / NXdlPerWave; // static constexpr index_t NumTokens = 1; static constexpr index_t SortedTileSize = MPerBlock; diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp index 06268f3cfb..b825d7ab69 100644 --- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp @@ -1117,12 +1117,31 @@ struct MfmaSelector #endif } + // Use singal rate mfma instruction for this special case A (f8_t) * B (pk_i4_t) + // See example gemm_xdl_fp8_pk_i4_bpreshuffle_v3 + // TODO: explore optimization opportunity by using new mfma instructions on gfx950 template <> - constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_32x32x16f8f8; } + template <> + constexpr auto GetMfma() + { + return MfmaInstr::mfma_f32_32x32x16f8f8; + } + + template <> + constexpr auto GetMfma() + { +#if defined(__gfx950__) + return MfmaInstr::mfma_f32_32x32x64f8f6f4; +#else + return MfmaInstr::mfma_f32_32x32x16f8f8; +#endif + } + template <> constexpr auto GetMfma() { @@ -1136,11 +1155,21 @@ struct MfmaSelector } template <> - constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_16x16x32f8f8; } + template <> + constexpr auto GetMfma() + { +#if defined(__gfx950__) + return MfmaInstr::mfma_f32_16x16x128f8f6f4; +#else + return MfmaInstr::mfma_f32_16x16x32f8f8; +#endif + } + template <> constexpr auto GetMfma() { @@ -1166,41 +1195,101 @@ struct MfmaSelector } template <> - constexpr auto GetMfma() + constexpr 
auto GetMfma() { return MfmaInstr::mfma_f32_32x32x16bf8bf8; } template <> - constexpr auto GetMfma() + constexpr auto GetMfma() + { +#if defined(__gfx950__) + return MfmaInstr::mfma_f32_32x32x64f8f6f4; +#else + return MfmaInstr::mfma_f32_32x32x16bf8bf8; +#endif + } + + template <> + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_16x16x32bf8bf8; } template <> - constexpr auto GetMfma() + constexpr auto GetMfma() + { +#if defined(__gfx950__) + return MfmaInstr::mfma_f32_16x16x128f8f6f4; +#else + return MfmaInstr::mfma_f32_16x16x32bf8bf8; +#endif + } + + template <> + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_32x32x16f8bf8; } template <> - constexpr auto GetMfma() + constexpr auto GetMfma() + { +#if defined(__gfx950__) + return MfmaInstr::mfma_f32_32x32x64f8f6f4; +#else + return MfmaInstr::mfma_f32_32x32x16f8bf8; +#endif + } + + template <> + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_16x16x32f8bf8; } template <> - constexpr auto GetMfma() + constexpr auto GetMfma() + { +#if defined(__gfx950__) + return MfmaInstr::mfma_f32_16x16x128f8f6f4; +#else + return MfmaInstr::mfma_f32_16x16x32f8bf8; +#endif + } + + template <> + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_32x32x16bf8f8; } template <> - constexpr auto GetMfma() + constexpr auto GetMfma() + { +#if defined(__gfx950__) + return MfmaInstr::mfma_f32_32x32x64f8f6f4; +#else + return MfmaInstr::mfma_f32_32x32x16bf8f8; +#endif + } + + template <> + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_16x16x32bf8f8; } + template <> + constexpr auto GetMfma() + { +#if defined(__gfx950__) + return MfmaInstr::mfma_f32_16x16x128f8f6f4; +#else + return MfmaInstr::mfma_f32_16x16x32bf8f8; +#endif + } + static constexpr auto selected_mfma = mfma_type::value || - is_same::value) && - KPack <= 4) || - (is_same::value && KPack <= 8)) - ? 
true - : false, - is_scale_mfma > {}; + // Falls back to single rate instruction on gfx950 if KPack is single rate; no change on gfx942- + // when base_type is either f8_t or bf8_t, additional_type will always be either f8_t or bf8_t, + // except Use single rate mfma instruction for this special case A (f8_t) * B (pk_i4_t) + static constexpr bool is_single_rate_mfma = + (((is_same::value || is_same::value) && + KPack <= 4) || + (is_same::value && KPack <= 8) || + ((is_same::value || is_same::value) && KPack < 32) || + is_same::value) + ? true + : false; + static constexpr auto mfma = MfmaSelector{}; static constexpr auto mfma_instr = mfma.selected_mfma; diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index ad48389625..ed3354dfb5 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -533,6 +533,50 @@ struct intrin_mfma_f32_32x32x64f8f6f4<32, 32> #endif } + template + __device__ static void Run(const bf8x32_t& reg_a, const f8x32_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx950__) + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + reg_a, + reg_b, + reg_c.template AsType()[Number<0>{}], + 1, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 0, // blgp + 0, + 0, + 0, + 0); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; +#endif + } + + template + __device__ static void Run(const f8x32_t& reg_a, const bf8x32_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx950__) + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + reg_a, + reg_b, + reg_c.template AsType()[Number<0>{}], + 0, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 1, // blgp + 0, + 0, + 0, + 0); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; +#endif + } + template __device__ static void Run(const f4x32_t& reg_a, const f4x32_t& reg_b, FloatC& reg_c) { @@ -1118,6 +1162,52 @@ struct 
intrin_mfma_f32_16x16x128f8f6f4<16, 16> #endif } + template + __device__ static void Run(const bf8x32_t& reg_a, const f8x32_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx950__) + // https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/llvm/test/Verifier/AMDGPU/mfma-scale.ll#L10 + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + reg_a, + reg_b, + reg_c.template AsType()[Number<0>{}], + 1, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 0, // blgp + 0, + 0, + 0, + 0); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; +#endif + } + + template + __device__ static void Run(const f8x32_t& reg_a, const bf8x32_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx950__) + // https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/llvm/test/Verifier/AMDGPU/mfma-scale.ll#L10 + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4( + reg_a, + reg_b, + reg_c.template AsType()[Number<0>{}], + 0, // cbsz {0 FP8 E4M3; 1 FP8 E5M2; 2 FP6 E2M3; 3 FP6 E3M2; 4 FP4 E2M1} + 1, // blgp + 0, + 0, + 0, + 0); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; +#endif + } + template __device__ static void Run(const f4x32_t& reg_a, const f4x32_t& reg_b, FloatC& reg_c) { From 0970f22221917d559d0bd72c1065303f08a70450 Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Tue, 20 May 2025 02:30:15 +0200 Subject: [PATCH 131/443] [CMake] Disable newly added compiler warning -Wnrvo (#2210) Recently a new warning was added to Clang to warn when no copy-elision on return happens. That prevents our CK build. This disables the warning. 
--- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e12462a41..13606975c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,6 +94,9 @@ add_compile_options(-Wno-pass-failed) add_compile_options(-Wno-switch-default) add_compile_options(-Wno-unique-object-duplication) +# Recent change in compiler makes this warning ON by default, which led to compile errors. +add_compile_options(-Wno-nrvo) + if(NOT DISABLE_DL_KERNELS) add_definitions(-DDL_KERNELS) set(DL_KERNELS "ON") From c4929225f60f56e3a9320547dcfdff30a77f0aa3 Mon Sep 17 00:00:00 2001 From: Aviral Goel Date: Mon, 19 May 2025 19:31:04 -0500 Subject: [PATCH 132/443] remove debug statements from CMakeLists (#2204) --- example/CMakeLists.txt | 2 -- test/CMakeLists.txt | 4 ---- 2 files changed, 6 deletions(-) diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 996a543ecc..9c30a2e255 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -135,11 +135,9 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME) endif() #message("add_example returns ${result}") if(result EQUAL 0 AND NOT "${EXAMPLE_NAME}" IN_LIST REGRESSION_EXAMPLES) - #message("adding to SMOKE EXAMPLE FILTER ${EXAMPLE_NAME}") set_tests_properties(${EXAMPLE_NAME} PROPERTIES LABELS "SMOKE_TEST") add_dependencies(smoke ${EXAMPLE_NAME}) elseif(result EQUAL 0 AND "${EXAMPLE_NAME}" IN_LIST REGRESSION_EXAMPLES) - #message("Adding to REGRESSION EXAMPLE FILTER ${EXAMPLE_NAME}") set_tests_properties(${EXAMPLE_NAME} PROPERTIES LABELS "REGRESSION_TEST") add_dependencies(regression ${EXAMPLE_NAME}) endif() diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 69ffb94488..5ea61d2dfc 100755 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -121,11 +121,9 @@ function(add_test_executable TEST_NAME) #message("add_test returns ${result}") set(result ${result} PARENT_SCOPE) if(result EQUAL 0 AND NOT "${TEST_NAME}" IN_LIST REGRESSION_TESTS) - message("adding to SMOKE TEST 
FILTER ${TEST_NAME}") set_tests_properties(${TEST_NAME} PROPERTIES LABELS "SMOKE_TEST") add_dependencies(smoke ${TEST_NAME}) elseif(result EQUAL 0 AND "${TEST_NAME}" IN_LIST REGRESSION_TESTS) - message("Adding to REGRESSION TEST FILTER ${TEST_NAME}") set_tests_properties(${TEST_NAME} PROPERTIES LABELS "REGRESSION_TEST") add_dependencies(regression ${TEST_NAME}) endif() @@ -222,11 +220,9 @@ function(add_gtest_executable TEST_NAME) #message("add_gtest returns ${result}") set(result ${result} PARENT_SCOPE) if(result EQUAL 0 AND NOT "${TEST_NAME}" IN_LIST REGRESSION_TESTS) - #message("adding to smoke test FILTER ${TEST_NAME}") set_tests_properties(${TEST_NAME} PROPERTIES LABELS "SMOKE_TEST") add_dependencies(smoke ${TEST_NAME}) elseif(result EQUAL 0 AND "${TEST_NAME}" IN_LIST REGRESSION_TESTS) - #message("Adding to REGRESSION TEST FILTER ${TEST_NAME}") set_tests_properties(${TEST_NAME} PROPERTIES LABELS "REGRESSION_TEST") add_dependencies(regression ${TEST_NAME}) endif() From d1e6f0982d04d6b356f001da731dd5e315f78812 Mon Sep 17 00:00:00 2001 From: Sami Remes Date: Tue, 20 May 2025 17:18:57 +0300 Subject: [PATCH 133/443] [CK_TILE] Grouped GEMM tile loop (#2146) * Add trait to use a persistent kernel and split the entrypoints in grouped gemm * Some helper functions for persistent kernel case * Get max occupancy grid using device properties * Implement tile loop in main entry point to grouped gemm * Enable GridSize() on device * Handle offset tile index using real current block index * Add persistent kernel choice to grouped gemm example * Use a for-loop for iterating over the group * Reduce VGPR spills by early-exit * Enable persistent kernel choice in grouped_gemm example * Add persistent kernel option to grouped_gemm test * Fix formatting with remod.py * Remove GridUpdateBlocks as blocks are now iteratively computed * Add comment about VGPR spilling * Fix formatting * Use CK_TILE_HOST instead of __host__ * Enable all Row/Col combinations in grouped gemm unit test * Add 
some KBatch=2 cases to grouped gemm tests * Fix SplitK for grouped gemm * Enable pipeline hotloop/tailnumber selection in-kernel for grouped gemm * Add type traits * Split examples to regular and tileloop * Formatting * Use hipExtStreamGetCUMask to get current active CUs for the given stream * Align test and example kernel config, and disable validation for splitk repeats * Remove debug options from CMakeLists.txt * Separate the code paths for persistent/non-persistent in test * Fix formatting * Address review comments --------- Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> --- .../ck_tile/17_grouped_gemm/CMakeLists.txt | 2 +- .../ck_tile/17_grouped_gemm/grouped_gemm.cpp | 132 ++++----- .../ck_tile/17_grouped_gemm/grouped_gemm.hpp | 17 +- .../17_grouped_gemm/grouped_gemm_tileloop.cpp | 174 ++++++++++++ .../run_grouped_gemm_example.inc | 98 +++++-- include/ck_tile/core/utility/type_traits.hpp | 11 + include/ck_tile/host/stream_utils.hpp | 45 ++++ .../ops/gemm/kernel/gemm_tile_partitioner.hpp | 18 +- .../ops/gemm/kernel/grouped_gemm_kernel.hpp | 252 ++++++++++++++++-- .../gemm_pipeline_ag_bg_cr_comp_v3.hpp | 12 +- ...ine_agmem_bgmem_creg_v1_default_policy.hpp | 0 .../ops/gemm/pipeline/tile_gemm_traits.hpp | 24 +- .../grouped_gemm/test_grouped_gemm.cpp | 30 ++- .../test_grouped_gemm_ut_cases.inc | 30 ++- .../grouped_gemm/test_grouped_gemm_util.hpp | 209 +++++++++++++-- 15 files changed, 908 insertions(+), 146 deletions(-) create mode 100644 example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp create mode 100644 include/ck_tile/host/stream_utils.hpp mode change 100755 => 100644 include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp diff --git a/example/ck_tile/17_grouped_gemm/CMakeLists.txt b/example/ck_tile/17_grouped_gemm/CMakeLists.txt index d34013dd6c..79df4e624d 100644 --- a/example/ck_tile/17_grouped_gemm/CMakeLists.txt +++ b/example/ck_tile/17_grouped_gemm/CMakeLists.txt @@ -1,2 +1,2 @@ 
add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp) - +add_executable(tile_example_grouped_gemm_tileloop EXCLUDE_FROM_ALL grouped_gemm_tileloop.cpp) diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp index 9b134ff779..61193e2e29 100644 --- a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp @@ -16,15 +16,10 @@ #include "ck_tile/host.hpp" #include "grouped_gemm.hpp" -std::size_t get_workspace_size(const std::vector& gemm_descs) -{ - return gemm_descs.size() * sizeof(ck_tile::GemmTransKernelArg); -} - template float grouped_gemm(const std::vector& gemm_descs, const ck_tile::stream_config& s, - void* p_workspace_) + void* kargs_ptr) { #if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY) // Memory friendly for Interwave scheduler @@ -114,70 +109,76 @@ float grouped_gemm(const std::vector& gemm_descs, float ave_time{0}; - const auto Run = - [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) { - constexpr bool has_hot_loop_v = has_hot_loop_.value; - constexpr auto tail_number_v = tail_number_.value; - constexpr auto scheduler = GEMM_PIPELINE_SCHEDULER; - constexpr auto memory_operation = memory_operation_.value; + const auto Run = [&](const auto has_hot_loop_, + const auto tail_number_, + const auto memory_operation_) { + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + constexpr auto scheduler = GEMM_PIPELINE_SCHEDULER; + constexpr auto memory_operation = memory_operation_.value; - using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; + using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; - using GemmPipeline = GEMM_PIPELINE; - using GemmEpilogue = ck_tile::CShuffleEpilogue< - ck_tile::CShuffleEpilogueProblem>; - using Kernel = ck_tile::GroupedGemmKernel; - auto kargs = Kernel::MakeKargs(gemm_descs); + using 
GemmPipeline = GEMM_PIPELINE; + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; + using Kernel = ck_tile::GroupedGemmKernel; + auto kargs = Kernel::MakeKargs(gemm_descs); + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Kernel arguments not supported!"); + } - const dim3 grids = Kernel::GridSize(gemm_descs); - constexpr dim3 blocks = Kernel::BlockSize(); + constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 grids = Kernel::GridSize(gemm_descs); - ck_tile::hip_check_error(hipMemcpyWithStream(p_workspace_, - kargs.data(), - get_workspace_size(gemm_descs), - hipMemcpyHostToDevice, - s.stream_id_)); + HIP_CHECK_ERROR(hipMemcpyWithStream(kargs_ptr, + kargs.data(), + get_workspace_size(gemm_descs), + hipMemcpyHostToDevice, + s.stream_id_)); - if(s.log_level_ > 0) - { - std::cout << "Launching kernel: " << Kernel::GetName() << " with args:" - << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" - << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z - << "}" << std::endl; - } + if(s.log_level_ > 0) + { + std::cout << "Launching kernel: " << Kernel::GetName() << " with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } - ave_time = ck_tile::launch_kernel( - s, - ck_tile::make_kernel( - Kernel{}, - grids, - blocks, - 0, - ck_tile::cast_pointer_to_constant_address_space(p_workspace_), - gemm_descs.size())); - return ave_time; - }; + ave_time = + ck_tile::launch_kernel(s, + ck_tile::make_kernel( + Kernel{}, + grids, + blocks, + 0, + ck_tile::cast_pointer_to_constant_address_space(kargs_ptr), + gemm_descs.size())); + + return ave_time; + }; const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { if(gemm_descs[0].k_batch == 1) @@ -317,4 +318,5 @@ float grouped_gemm(const std::vector& gemm_descs, #include 
"run_grouped_gemm_example.inc" -int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); } +constexpr bool Persistent = false; +int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); } diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp index 4fec329c2f..77db182c72 100644 --- a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp @@ -70,14 +70,25 @@ auto create_args(int argc, char* argv[]) .insert("validate", "1", "0. No validation, 1. Validation on CPU.") .insert("warmup", "10", "number of iterations before benchmark the kernel.") .insert("repeat", "100", "number of iterations to benchmark the kernel.") - .insert("group_count", "8", "group count."); + .insert("group_count", "8", "group count.") + .insert("kbatch", "1", "kbatch for SplitK"); bool result = arg_parser.parse(argc, argv); return std::make_tuple(result, arg_parser); } -std::size_t get_workspace_size(const std::vector& gemm_descs); +inline std::size_t get_workspace_size(const std::vector& gemm_descs) +{ + return gemm_descs.size() * sizeof(ck_tile::GemmTransKernelArg); +} +template float grouped_gemm(const std::vector& gemm_descs, const ck_tile::stream_config& s, - void* p_workspace_); + void* kargs_ptr); + +template +float grouped_gemm_tileloop(const ck_tile::stream_config& s, + const ck_tile::index_t num_groups, + void* kargs_ptr, + bool splitk = false); diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp new file mode 100644 index 0000000000..5c0cb92683 --- /dev/null +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/host.hpp" +#include "grouped_gemm.hpp" + +template +float grouped_gemm_tileloop(const ck_tile::stream_config& s, + const ck_tile::index_t num_groups, + void* kargs_ptr, + bool splitk) +{ +#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY) + // Memory friendly for Interwave scheduler + constexpr ck_tile::index_t M_Tile = 128; + constexpr ck_tile::index_t N_Tile = 32; + constexpr ck_tile::index_t K_Tile = 64; + + constexpr ck_tile::index_t M_Warp = 4; + constexpr ck_tile::index_t N_Warp = 1; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 32; + constexpr ck_tile::index_t N_Warp_Tile = 32; + constexpr ck_tile::index_t K_Warp_Tile = 8; + + constexpr bool DoubleSmemBuffer = false; +#endif +#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3) + // Compute friendly for Intrawave scheduler + constexpr ck_tile::index_t M_Tile = 256; + constexpr ck_tile::index_t N_Tile = 256; + constexpr ck_tile::index_t K_Tile = 64; + + constexpr ck_tile::index_t M_Warp = 2; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 32; + constexpr ck_tile::index_t N_Warp_Tile = 32; + constexpr ck_tile::index_t K_Warp_Tile = 16; + + constexpr bool DoubleSmemBuffer = false; +#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4) + // Compute friendly for Intrawave scheduler + // Using the ping pong reader in the lds level + constexpr ck_tile::index_t M_Tile = 256; + constexpr ck_tile::index_t N_Tile = 256; + constexpr ck_tile::index_t K_Tile = 32; + + constexpr ck_tile::index_t M_Warp = 2; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 32; + constexpr ck_tile::index_t N_Warp_Tile = 32; + constexpr 
ck_tile::index_t K_Warp_Tile = 16; + + constexpr bool DoubleSmemBuffer = true; +#endif + + constexpr bool kPadM = false; + constexpr bool kPadN = false; + constexpr bool kPadK = false; + + constexpr int kBlockPerCu = 1; + constexpr ck_tile::index_t TileParitionerGroupNum = 8; + constexpr ck_tile::index_t TileParitionerM01 = 4; + + using GemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + using TilePartitioner = ck_tile:: + GemmSpatiallyLocalTilePartitioner; + + using Traits = ck_tile::TileGemmTraits; + using GemmUniversalTraits = ck_tile::PersistentTileGemmUniversalTraits; + using GemmPipelineProblem = + ck_tile::GemmPipelineProblem; + + float ave_time{0}; + + const auto Run = [&](const auto memory_operation_) { + constexpr auto scheduler = GEMM_PIPELINE_SCHEDULER; + constexpr auto memory_operation = memory_operation_.value; + + // We create the GEMM pipeline without specifying hotloop or tailnumber. + // These are automatically run inside the kernel based on the given input data. 
+ using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; + + using GemmPipeline = GEMM_PIPELINE; + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; + using Kernel = ck_tile::GroupedGemmKernel; + constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 grids = Kernel::MaxOccupancyGridSize(s); + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel: " << Kernel::GetName() << " with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + + ave_time = + ck_tile::launch_kernel(s, + ck_tile::make_kernel( + Kernel{}, + grids, + blocks, + 0, + ck_tile::cast_pointer_to_constant_address_space(kargs_ptr), + num_groups)); + + return ave_time; + }; + + if(!splitk) + { + Run(ck_tile::integral_constant{}); + } + else + { + Run(ck_tile::integral_constant{}); + } + + return ave_time; +} + +#include "run_grouped_gemm_example.inc" + +constexpr bool Persistent = true; +int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); } diff --git a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc index f068510d26..a01d8178cc 100644 --- a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc +++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc @@ -30,20 +30,60 @@ auto calculate_rtol_atol(const ck_tile::index_t K, return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); } -template +template float invoke_gemm(int n_warmup, int n_repeat, int group_count, const std::vector& args) { - + // Workspace memory allocated to hold the gemm descriptions. 
ck_tile::DeviceMem gemm_workspace; gemm_workspace.Realloc(get_workspace_size(args)); - float ave_time = grouped_gemm( - args, - ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}, - gemm_workspace.GetDeviceBuffer()); + float ave_time = 0; + if constexpr(!Persistent) + { + // Regular version of grouped gemm + ave_time = grouped_gemm( + args, + ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}, + gemm_workspace.GetDeviceBuffer()); + } + else + { + // NOTE: With the persistent TileLoop kernel, we do not necessarily need to have + // the gemm problems known on the host. Instead, we can just pass the pointer + // to the kernel and let the workgroups figure out which tiles to work on. + // This is useful when the gemm problems are generated dynamically. + // In this example however, we generate the `kargs` using the known gemm_descs, + // and copy the gemm descriptions to the device memory. + // The contents of the memory pointed to by `kargs_ptr` pointer could be + // written by e.g. another kernel from earlier stage. 
+ std::vector kargs; + void* kargs_ptr = gemm_workspace.GetDeviceBuffer(); + const bool splitk = args[0].k_batch > 1; + for(const auto& arg : args) + { + kargs.emplace_back(ck_tile::GemmKernelArgs{arg.a_ptr, + arg.b_ptr, + arg.c_ptr, + arg.M, + arg.N, + arg.K, + arg.stride_A, + arg.stride_B, + arg.stride_C, + arg.k_batch}); + } + const auto stream = ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}; + HIP_CHECK_ERROR(hipMemcpyWithStream(kargs_ptr, + kargs.data(), + kargs.size() * sizeof(ck_tile::GemmTransKernelArg), + hipMemcpyHostToDevice, + stream.stream_id_)); + ave_time = grouped_gemm_tileloop( + stream, group_count, kargs_ptr, splitk); + } std::string op_name{"Grouped Gemm"}; @@ -66,7 +106,7 @@ float invoke_gemm(int n_warmup, return ave_time; } -template +template int run_grouped_gemm_example_with_layouts(int argc, char* argv[], const ALayout a_layout = ALayout{}, @@ -87,6 +127,15 @@ int run_grouped_gemm_example_with_layouts(int argc, const int group_count = arg_parser.get_int("group_count"); const int repeat = arg_parser.get_int("repeat"); const int warmup = arg_parser.get_int("warmup"); + const int kbatch = arg_parser.get_int("kbatch"); + bool validate = arg_parser.get_bool("validate"); + + if(kbatch > 1 && validate && warmup + repeat > 1) + { + std::cout << "WARNING: Data validation enabled with SplitK and more than" + << "1 warmup/repeat. Disabling validation." 
<< std::endl; + validate = false; + } std::vector Ms = arg_parser.get_int_vec("Ms"); std::vector Ns = arg_parser.get_int_vec("Ns"); @@ -102,7 +151,7 @@ int run_grouped_gemm_example_with_layouts(int argc, { Ms.push_back(256 + 256 * i); Ns.push_back(256 + 512 * i); - Ks.push_back(256 + 64 * i); + Ks.push_back(512 + 128 * i); stride_As.push_back(Ks[i]); stride_Bs.push_back(Ks[i]); @@ -150,8 +199,8 @@ int run_grouped_gemm_example_with_layouts(int argc, << " a_m_k: " << a_m_k_tensors[i].mDesc << " b_k_n: " << b_k_n_tensors[i].mDesc << " c_m_n: " << c_m_n_tensors[i].mDesc << std::endl; - ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k_tensors[i]); - ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n_tensors[i]); + ck_tile::FillUniformDistribution{-1.f, 1.f}(a_m_k_tensors[i]); + ck_tile::FillUniformDistribution{-1.f, 1.f}(b_k_n_tensors[i]); a_m_k_dev_buf.push_back(std::make_unique( a_m_k_tensors[i].get_element_space_size_in_bytes())); @@ -169,13 +218,11 @@ int run_grouped_gemm_example_with_layouts(int argc, const void* p_b = b_k_n_dev_buf[i]->GetDeviceBuffer(); void* p_c = c_m_n_dev_buf[i]->GetDeviceBuffer(); - // TODO Add support for kbatch > 1 in grouped gemm - static constexpr ck_tile::index_t k_batch = 1; gemm_descs.push_back( - {p_a, p_b, p_c, k_batch, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]}); + {p_a, p_b, p_c, kbatch, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]}); } - invoke_gemm(warmup, repeat, group_count, gemm_descs); + invoke_gemm(warmup, repeat, group_count, gemm_descs); for(int i = 0; i < group_count; i++) { @@ -183,7 +230,7 @@ int run_grouped_gemm_example_with_layouts(int argc, } bool pass{true}; - if(arg_parser.get_int("validate")) + if(validate) { for(int i = 0; i < group_count; ++i) { @@ -194,7 +241,7 @@ int run_grouped_gemm_example_with_layouts(int argc, a_m_k_tensors[i], b_k_n_tensors[i], c_m_n_host_ref); const float max_accumulated_value = *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end()); - const auto 
rtol_atol = calculate_rtol_atol(Ks[i], 1 /*kbatch*/, max_accumulated_value); + const auto rtol_atol = calculate_rtol_atol(Ks[i], kbatch, max_accumulated_value); pass &= ck_tile::check_err(c_m_n_tensors[i], c_m_n_host_ref, "Error: Incorrect results!", @@ -211,6 +258,7 @@ int run_grouped_gemm_example_with_layouts(int argc, return pass; } +template int run_grouped_gemm_example(int argc, char* argv[]) { auto [result, arg_parser] = create_args(argc, argv); @@ -227,12 +275,20 @@ int run_grouped_gemm_example(int argc, char* argv[]) if(a_layout == "R" && b_layout == "C") { - return run_grouped_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); + return run_grouped_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{}); + } + else if(a_layout == "R" && b_layout == "R") + { + return run_grouped_gemm_example_with_layouts(argc, argv, Row{}, Row{}, Row{}); + } + else if(a_layout == "C" && b_layout == "R") + { + return run_grouped_gemm_example_with_layouts(argc, argv, Col{}, Row{}, Row{}); + } + else if(a_layout == "C" && b_layout == "C") + { + return run_grouped_gemm_example_with_layouts(argc, argv, Col{}, Col{}, Row{}); } - // else if(a_layout == "R" && b_layout == "R") - // { - // return run_grouped_gemm_example_with_layouts(argc, argv, Row{}, Row{}, Row{}); - // } else { throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!"); diff --git a/include/ck_tile/core/utility/type_traits.hpp b/include/ck_tile/core/utility/type_traits.hpp index b432cfcef7..2e82e21ba1 100644 --- a/include/ck_tile/core/utility/type_traits.hpp +++ b/include/ck_tile/core/utility/type_traits.hpp @@ -127,4 +127,15 @@ struct is_any_of { }; +// Helper to check if a type is a specialization of a given template +template class RefTemplate> +struct is_specialization_of : std::false_type +{ +}; + +template